home *** CD-ROM | disk | FTP | other *** search
/ Computer Shopper 253 / Issue 253 - March 2009 - DPCS0309DVD.ISO / Toolkit / Internet / WinHTTrack / httrack-3.43.exe / {app} / src / htsparse.c < prev    next >
Encoding:
C/C++ Source or Header  |  1980-01-01  |  197.3 KB  |  4,591 lines

  1. /* ------------------------------------------------------------ */
  2. /*
  3. HTTrack Website Copier, Offline Browser for Windows and Unix
  4. Copyright (C) Xavier Roche and other contributors
  5.  
  6. This program is free software; you can redistribute it and/or
  7. modify it under the terms of the GNU General Public License
  8. as published by the Free Software Foundation; either version 2
  9. of the License, or any later version.
  10.  
  11. This program is distributed in the hope that it will be useful,
  12. but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14. GNU General Public License for more details.
  15.  
  16. You should have received a copy of the GNU General Public License
  17. along with this program; if not, write to the Free Software
  18. Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
  19.  
  20.  
  21. Important notes:
  22.  
  23. - We hereby ask people using this source NOT to use it in purpose of grabbing
  24. emails addresses, or collecting any other private information on persons.
  25. This would disgrace our work, and spoil the many hours we spent on it.
  26.  
  27.  
  28. Please visit our Website: http://www.httrack.com
  29. */
  30.  
  31.  
  32. /* ------------------------------------------------------------ */
  33. /* File: htsparse.c parser                                      */
  34. /*       html/javascript/css parser                             */
  35. /*       and other parser routines                              */
  36. /* Author: Xavier Roche                                         */
  37. /* ------------------------------------------------------------ */
  38.  
  39.  
  40. /* Internal engine bytecode */
  41. #define HTS_INTERNAL_BYTECODE
  42.  
  43. #ifndef  _WIN32_WCE
  44. #include <fcntl.h>
  45. #endif
  46. #include <ctype.h>
  47.  
  48. /* File defs */
  49. #include "htscore.h"
  50.  
  51. /* specific definitions */
  52. #include "htsbase.h"
  53. #include "htsnet.h"
  54. #include "htsbauth.h"
  55. #include "htsmd5.h"
  56. #include "htsindex.h"
  57.  
  58. /* external modules */
  59. #include "htsmodules.h"
  60.  
  61. // htswrap_add
  62. #include "htswrap.h"
  63.  
  64. // parser
  65. #include "htsparse.h"
  66. #include "htsback.h"
  67.  
  68. // specific defines
  // Fields of the link currently being processed, liens[ptr]
  #define urladr   (liens[ptr]->adr)
  #define urlfil   (liens[ptr]->fil)
  #define savename (liens[ptr]->sav)
  // Same fields, read from the link indexed by liens[ptr]->precedent
  #define parenturladr   (liens[liens[ptr]->precedent]->adr)
  #define parenturlfil   (liens[liens[ptr]->precedent]->fil)
  #define parentsavename (liens[liens[ptr]->precedent]->sav)
  // Use the 'precedent' link's fields when parent_relative is set, the current link's otherwise
  #define relativeurladr   ((parent_relative)?parenturladr:urladr)
  #define relativeurlfil   ((parent_relative)?parenturlfil:urlfil)
  #define relativesavename ((parent_relative)?parentsavename:savename)
  78.  
  // Flush the log stream when flushing is requested (opt->flush) and a log is open.
  // The previous definition repeated the very same opt->log flush twice.
  #define test_flush if (opt->flush) { if (opt->log) { fflush(opt->log); } }
  80.  
  // No-op placeholder; written as a statement followed by ';'
  #define XH_uninit ((void) 0)
  83.  
  // Optimised version, which allows unmodified html files to be left untouched (update)
  // Extra room added every time the output buffer has to grow
  #define REALLOC_SIZE 8192
  // Reserve room for (A) more bytes plus a terminating NUL in ht_buff (growing it with
  // realloct when needed), then advance ht_len by (A). The program exits when the
  // reallocation fails. Expands to two statements: callers in this file invoke it
  // WITHOUT a trailing semicolon, and it must not be used as an unbraced if/else body.
  #define HT_ADD_CHK(A) if (((size_t) (A)+ht_len+1) >= ht_size) { \
    ht_size=(A)+ht_len+REALLOC_SIZE; \
    ht_buff=(char*) realloct(ht_buff,ht_size); \
    if (ht_buff==NULL) { \
    printf("PANIC! : Not enough memory [%d]\n",__LINE__); \
    XH_uninit; \
    /* %d expects an int: cast the size_t, as HT_ADD_START does */ \
    abortLogFmt("not enough memory for current html document in HT_ADD_CHK : realloct(%d) failed" _ (int) ht_size); \
    exit(1); \
    } \
  } \
    ht_len+=(A);
  // Append the input bytes between lastsaved and adr to ht_buff, keep the buffer
  // NUL-terminated and move lastsaved up to adr. Does nothing unless (opt->getmode & 1)
  // is set and ptr is greater than zero.
  #define HT_ADD_ADR \
    if ((opt->getmode & 1) && (ptr>0)) { \
      const size_t pending_ = (size_t) (adr - lastsaved); \
      const size_t offset_ = ht_len; \
      HT_ADD_CHK(pending_) \
      memcpy(ht_buff+offset_, lastsaved, pending_); \
      ht_buff[offset_+pending_]='\0'; \
      lastsaved=adr; \
    }
  // Append the NUL-terminated string (A) to ht_buff, keeping the buffer NUL-terminated.
  // Does nothing unless (opt->getmode & 1) is set and ptr is greater than zero, or when
  // the string is empty. The argument is evaluated exactly once, so expressions with
  // side effects are safe to pass.
  #define HT_ADD(A) \
    if ((opt->getmode & 1) && (ptr>0)) { \
    const char* const ht_add_str_ = (A); \
    const size_t i_ = strlen(ht_add_str_), j_ = ht_len; \
    if (i_) { \
    HT_ADD_CHK(i_) \
    memcpy(ht_buff+j_, ht_add_str_, i_); \
    ht_buff[j_+i_]='\0'; \
    } }
  // Run A through escape_for_html_print() into a temporary buffer and append the result
  // to ht_buff (same enabling condition as HT_ADD; empty results are skipped).
  #define HT_ADD_HTMLESCAPED(A) \
    if ((opt->getmode & 1) && (ptr>0)) { \
      char BIGSTK escaped_[HTS_URLMAXSIZE*2]; \
      size_t esc_len_; \
      escape_for_html_print(A, escaped_); \
      esc_len_=strlen(escaped_); \
      if (esc_len_ != 0) { \
        const size_t at_ = ht_len; \
        HT_ADD_CHK(esc_len_) \
        memcpy(ht_buff+at_, escaped_, esc_len_); \
        ht_buff[at_+esc_len_]='\0'; \
      } \
    }
  // Same as HT_ADD_HTMLESCAPED, but the text goes through escape_for_html_print_full()
  #define HT_ADD_HTMLESCAPED_FULL(A) \
    if ((opt->getmode & 1) && (ptr>0)) { \
      char BIGSTK escaped_[HTS_URLMAXSIZE*2]; \
      size_t esc_len_; \
      escape_for_html_print_full(A, escaped_); \
      esc_len_=strlen(escaped_); \
      if (esc_len_ != 0) { \
        const size_t at_ = ht_len; \
        HT_ADD_CHK(esc_len_) \
        memcpy(ht_buff+at_, escaped_, esc_len_); \
        ht_buff[at_+esc_len_]='\0'; \
      } \
    }
  // Declare the output buffer state (ht_buff, ht_len, ht_size) in the current scope.
  // The buffer itself is only allocated when (opt->getmode & 1) is set and ptr is
  // greater than zero; its initial size is 5/4 of r->size plus REALLOC_SIZE.
  // The program exits when the allocation fails.
  #define HT_ADD_START \
    char* ht_buff=NULL; \
    size_t ht_len=0; \
    size_t ht_size=(size_t)(r->size*5)/4+REALLOC_SIZE; \
    if ((opt->getmode & 1) && (ptr>0)) { \
      ht_buff=(char*) malloct(ht_size); \
      if (ht_buff!=NULL) { \
        ht_buff[0]='\0'; \
      } else { \
        printf("PANIC! : Not enough memory [%d]\n",__LINE__); \
        XH_uninit; \
        abortLogFmt("not enough memory for current html document in HT_ADD_START : malloct(%d) failed" _ (int) ht_size); \
        exit(1); \
      } \
    }
  // Finish the rewritten document held in ht_buff:
  //  - compute its MD5 digest;
  //  - if the file already on disk has the same size and the digest stored in the cache
  //    under "//[HTML-MD5]//" matches, leave the file alone (only note it);
  //  - otherwise create the file and write ht_buff into it, logging any failure and
  //    setting opt->state.exit_xh to -1 when check_fatal_io_errno() reports a fatal error;
  //  - store the new digest in the cache, then release ht_buff.
  // Uses fp, r, cache, opt and the variables declared by HT_ADD_START from the enclosing scope.
  #define HT_ADD_END { \
    int ok=0;\
    if (ht_buff) { \
      char digest[32+2];\
      off_t fsize_old = fsize(fconv(OPT_GET_BUFF(opt),savename));\
      digest[0]='\0';\
      domd5mem(ht_buff,ht_len,digest,1);\
      if (fsize_old==ht_len) { \
        int mlen = 0;\
        /* start from NULL so a failed lookup cannot leave a wild pointer behind */ \
        char* mbuff = NULL;\
        cache_readdata(cache,"//[HTML-MD5]//",savename,&mbuff,&mlen);\
        if (mlen && mbuff!=NULL) \
          mbuff[mlen]='\0';\
        if ((mlen == 32) && (strcmp(((mbuff!=NULL)?mbuff:""),digest)==0)) {\
          ok=1;\
          if ( (opt->debug>1) && (opt->log!=NULL) ) {\
            HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"File not re-written (md5): %s"LF,savename);\
            test_flush;\
          }\
        } else {\
          ok=0;\
        } \
      }\
      if (!ok) { \
        file_notify(opt,urladr, urlfil, savename, 1, 1, r->notmodified); \
        fp=filecreate(&opt->state.strc, savename); \
        if (fp) { \
          if (ht_len>0) {\
          if (fwrite(ht_buff,1,ht_len,fp) != ht_len) { \
            /* save errno before any other call gets a chance to change it */ \
            const int last_errno = errno; \
            int fcheck;\
            if ((fcheck=check_fatal_io_errno())) {\
              opt->state.exit_xh=-1;\
            }\
            if (opt->log) {   \
              HTS_LOG(opt,LOG_ERROR); fprintf(opt->log,"Unable to write HTML file %s: %s"LF, savename, strerror(last_errno));\
              if (fcheck) {\
                HTS_LOG(opt,LOG_ERROR);\
                fprintf(opt->log,"* * Fatal write error, giving up"LF);\
              }\
              test_flush;\
            }\
          }\
          }\
          fclose(fp); fp=NULL; \
          if (strnotempty(r->lastmodified)) \
          set_filetime_rfc822(savename,r->lastmodified); \
        } else {\
          /* save errno before any other call gets a chance to change it */ \
          const int last_errno = errno; \
          int fcheck;\
          if ((fcheck=check_fatal_io_errno())) {\
            /* the log may be closed: every other write to it is guarded the same way */ \
            if (opt->log) { \
              HTS_LOG(opt,LOG_ERROR); fprintf(opt->log,"Mirror aborted: disk full or filesystem problems"LF); \
              test_flush; \
            } \
            opt->state.exit_xh=-1;\
          }\
          if (opt->log) { \
            HTS_LOG(opt,LOG_ERROR);\
            fprintf(opt->log,"Unable to save file %s : %s"LF, savename, strerror(last_errno));\
            if (fcheck) {\
              HTS_LOG(opt,LOG_ERROR);\
              fprintf(opt->log,"* * Fatal write error, giving up"LF);\
            }\
            test_flush;\
          }\
        }\
      } else {\
        file_notify(opt,urladr, urlfil, savename, 0, 0, r->notmodified); \
        filenote(&opt->state.strc, savename,NULL); \
      }\
      if (cache->ndx)\
        cache_writedata(cache->ndx,cache->dat,"//[HTML-MD5]//",savename,digest,(int)strlen(digest));\
    } \
    freet(ht_buff); ht_buff=NULL; \
  }
  // Expands to nothing
  #define HT_ADD_FOP 
  225.  
  // COPY IN HTSCORE.C
  // Close the top index being built, if any: write the footer template (with a
  // meta-refresh to makeindex_firstlink when exactly one link was recorded), close
  // makeindex_fp, run usercommand() on the generated index.html and set makeindex_done.
  #define HT_INDEX_END do { \
    if (!makeindex_done) { \
    if (makeindex_fp) { \
    /* room for the whole link plus the surrounding markup; elsewhere in this file  */ \
    /* makeindex_firstlink is filled from a HTS_URLMAXSIZE*2 buffer, which the       */ \
    /* former 1024-byte buffer could not hold                                        */ \
    char BIGSTK tempo[HTS_URLMAXSIZE*2+256]; \
    if (makeindex_links == 1) { \
    sprintf(tempo,"<meta HTTP-EQUIV=\"Refresh\" CONTENT=\"0; URL=%s\">"CRLF,makeindex_firstlink); \
    } else \
    tempo[0]='\0'; \
    fprintf(makeindex_fp,template_footer, \
    "<!-- Mirror and index made by HTTrack Website Copier/"HTTRACK_VERSION" "HTTRACK_AFF_AUTHORS" -->", \
    tempo \
    ); \
    fflush(makeindex_fp); \
    fclose(makeindex_fp);  /* do not forget this one, or you are in for a sleepless night */  \
    makeindex_fp=NULL; \
    usercommand(opt,0,NULL,fconcat(OPT_GET_BUFF(opt), StringBuff(opt->path_html),"index.html"),"primary","primary");  \
    } \
    } \
    makeindex_done=1;    /* ok, done */  \
  } while(0)
  247.  
  // Recording a link:
  // compute the room needed: size of the strings to store (size forced even, plus 2 safety bytes)
  // then check that the buffer has enough room left - if not, allocate another one
  // finally write at the current buffer address, which is advanced; the room left is decreased accordingly
  // codebase: if not empty and a .class file is stored, it is noted as the primary path for classes
  // FA,FF: former_adr and former_fil, the original link
  #define liens_record_sav_len(A) 

  // COPY OF HTSCORE.C

  // On return liens[lien_tot] is NULL when a new block was needed and could not be allocated.
  #define liens_record(A,F,S,FA,FF) { \
    int notecode=0; \
    size_t lienurl_len=((sizeof(lien_url)+HTS_ALIGN-1)/HTS_ALIGN)*HTS_ALIGN,\
    adr_len=strlen(A),\
    fil_len=strlen(F),\
    sav_len=strlen(S),\
    cod_len=0,\
    former_adr_len=strlen(FA),\
    former_fil_len=strlen(FF); \
    if (former_adr_len>0) {\
      former_adr_len=(former_adr_len/HTS_ALIGN)*HTS_ALIGN+HTS_ALIGN*2; \
      former_fil_len=(former_fil_len/HTS_ALIGN)*HTS_ALIGN+HTS_ALIGN*2; \
    } else \
      former_adr_len=former_fil_len=0;\
    /* fil_len still holds the raw strlen(F) here; it is only rounded up below */ \
    if (fil_len>6) if (strnotempty(codebase)) if (strfield((F)+fil_len-6,".class")) {\
      notecode=1; \
      cod_len=strlen(codebase); \
      cod_len=(cod_len/HTS_ALIGN)*HTS_ALIGN+HTS_ALIGN*2; \
    } \
    adr_len=(adr_len/HTS_ALIGN)*HTS_ALIGN+HTS_ALIGN*2; \
    fil_len=(fil_len/HTS_ALIGN)*HTS_ALIGN+HTS_ALIGN*2; \
    sav_len=(sav_len/HTS_ALIGN)*HTS_ALIGN+HTS_ALIGN*2; \
    if ((int) lien_size < (int) (adr_len+fil_len+sav_len+cod_len+former_adr_len+former_fil_len+lienurl_len)) { \
      lien_buffer=(char*) ((void*) calloct(add_tab_alloc,1)); \
      if (lien_buffer!=NULL) { \
      lien_size=add_tab_alloc; \
      liens[lien_tot]=(lien_url*) (void*) lien_buffer; lien_buffer+=lienurl_len; lien_size-=lienurl_len; \
      liens[lien_tot]->firstblock=1; \
      } else { \
      /* no block: advertise no room, so a later call cannot carve entries out of NULL, */ \
      /* and make the failure visible through a NULL entry                              */ \
      lien_size=0; \
      liens[lien_tot]=NULL; \
      } \
    } else { \
      liens[lien_tot]=(lien_url*) (void*) lien_buffer; lien_buffer+=lienurl_len; lien_size-=lienurl_len; \
      liens[lien_tot]->firstblock=0; \
    } \
    if (liens[lien_tot]!=NULL) { \
      liens[lien_tot]->adr=lien_buffer; lien_buffer+=adr_len; lien_size-=adr_len; \
      liens[lien_tot]->fil=lien_buffer; lien_buffer+=fil_len; lien_size-=fil_len; \
      liens[lien_tot]->sav=lien_buffer; lien_buffer+=sav_len; lien_size-=sav_len; \
      liens[lien_tot]->cod=NULL; \
      if (notecode) { \
        liens[lien_tot]->cod=lien_buffer; \
        lien_buffer+=cod_len; \
        lien_size-=cod_len; \
        strcpybuff(liens[lien_tot]->cod,codebase); \
      } \
      if (former_adr_len>0) {\
        liens[lien_tot]->former_adr=lien_buffer; lien_buffer+=former_adr_len; lien_size-=former_adr_len; \
        liens[lien_tot]->former_fil=lien_buffer; lien_buffer+=former_fil_len; lien_size-=former_fil_len; \
        strcpybuff(liens[lien_tot]->former_adr,FA); \
        strcpybuff(liens[lien_tot]->former_fil,FF); \
      }\
      strcpybuff(liens[lien_tot]->adr,A); \
      strcpybuff(liens[lien_tot]->fil,F); \
      strcpybuff(liens[lien_tot]->sav,S); \
      liens_record_sav_len(liens[lien_tot]); \
      hash_write(hashptr,lien_tot,opt->urlhack);  \
    } \
  }
  315.  
  // Declare the parser's local context on top of ENGINE_DEFINE_CONTEXT_BASE():
  // fixed pointers taken from stre, then local working copies of the values stre points to.
  // The last declaration carries no ';' - the invocation supplies it.
  #define ENGINE_DEFINE_CONTEXT() \
    ENGINE_DEFINE_CONTEXT_BASE(); \
    /* fixed pointers taken from stre */ \
    htsblk* const r HTS_UNUSED = stre->r_; \
    hash_struct* const hash HTS_UNUSED = stre->hash_; \
    char* const codebase HTS_UNUSED = stre->codebase; \
    char* const base HTS_UNUSED = stre->base; \
    /* index templates */ \
    const char * const template_header HTS_UNUSED = stre->template_header_; \
    const char * const template_body HTS_UNUSED = stre->template_body_; \
    const char * const template_footer HTS_UNUSED = stre->template_footer_; \
    /* */ \
    char* const makeindex_firstlink = stre->makeindex_firstlink_; \
    /* local copies, read through the pointers held in stre */ \
    int error = *stre->error_; \
    int store_errpage = *stre->store_errpage_; \
    int lien_max = *stre->lien_max_; \
    /* */ \
    int makeindex_done = *stre->makeindex_done_; \
    FILE* makeindex_fp = *stre->makeindex_fp_; \
    int makeindex_links = *stre->makeindex_links_; \
    /* */ \
    LLint stat_fragment = *stre->stat_fragment_; \
    TStamp makestat_time = stre->makestat_time; \
    FILE* makestat_fp = stre->makestat_fp
  342.  
  // Reload the already-declared local working copies from stre, after ENGINE_SET_CONTEXT_BASE().
  // The last assignment carries no ';' - the invocation supplies it.
  #define ENGINE_SET_CONTEXT() \
    ENGINE_SET_CONTEXT_BASE(); \
    /* */ \
    error = *stre->error_; \
    store_errpage = *stre->store_errpage_; \
    lien_max = *stre->lien_max_; \
    /* */ \
    makeindex_done = *stre->makeindex_done_; \
    makeindex_fp = *stre->makeindex_fp_; \
    makeindex_links = *stre->makeindex_links_; \
    /* */ \
    stat_fragment = *stre->stat_fragment_; \
    makestat_time = stre->makestat_time; \
    makestat_fp = stre->makestat_fp
  357.  
  // Loading the context is the same as declaring it
  #define ENGINE_LOAD_CONTEXT() \
    ENGINE_DEFINE_CONTEXT()

  // Write the local working copies back through the pointers held in stre, after
  // ENGINE_SAVE_CONTEXT_BASE(). makestat_time and makestat_fp are not written back.
  // The last assignment carries no ';' - the invocation supplies it.
  #define ENGINE_SAVE_CONTEXT() \
    ENGINE_SAVE_CONTEXT_BASE(); \
    /* */ \
    *stre->error_ = error; \
    *stre->store_errpage_ = store_errpage; \
    *stre->lien_max_ = lien_max; \
    /* */ \
    *stre->makeindex_done_ = makeindex_done; \
    *stre->makeindex_fp_ = makeindex_fp; \
    *stre->makeindex_links_ = makeindex_links; \
    /* */ \
    *stre->stat_fragment_ = stat_fragment
  373.  
  // Shorthands for fields reached through opt
  #define _FILTERS     (*opt->filters.filters)           /* object pointed to by opt->filters.filters */
  #define _FILTERS_PTR (opt->filters.filptr)
  #define _ROBOTS      ((robots_wizard*)opt->robotsptr)  /* opt->robotsptr viewed as a robots_wizard* */
  377.  
  /* Feed the character at *adr to the script automaton: when inscript is set, look up the
     transition for that character in the current state's row, falling back on the row's
     INSCRIPT_DEFAULT entry when the character has none (negative entry), and move there */
  #define AUTOMATE_LOOKUP_CURRENT_ADR() do { \
    if (inscript) { \
      int next_state_ = inscript_state[inscript_state_pos][(unsigned char)*adr]; \
      if (next_state_ < 0) { \
        next_state_ = inscript_state[inscript_state_pos][INSCRIPT_DEFAULT]; \
      } \
      /* the target must be a valid row of the table */ \
      assertf(next_state_ >= 0); \
      assertf(next_state_*sizeof(inscript_state[0]) < sizeof(inscript_state)); \
      inscript_state_pos = next_state_; \
    } \
  } while(0)
  391.  
  /* Advance adr by 'steps' characters, one at a time, running the script automaton on
     every character stepped onto; zero or negative counts leave adr unchanged */
  #define INCREMENT_CURRENT_ADR(steps) do { \
    int incr_left_ = (int) ( steps ); \
    for( ; incr_left_ > 0 ; incr_left_--) { \
      adr++; \
      AUTOMATE_LOOKUP_CURRENT_ADR(); \
    } \
  } while(0)
  401.  
  402.  
  403. /* Main parser */
  404. int htsparse(htsmoduleStruct* str, htsmoduleStructExtended* stre) {
  405.     char catbuff[CATBUFF_SIZE];
  406.   /* Load engine variables */
  407.   ENGINE_LOAD_CONTEXT();
  408.  
  409.   {
  410.     char* cAddr = r->adr;
  411.     int cSize = (int) r->size;
  412.     if ( (opt->debug>0) && (opt->log!=NULL) ) {
  413.       HTS_LOG(opt,LOG_INFO); fprintf(opt->log,"engine: preprocess-html: %s%s"LF, urladr, urlfil);
  414.     }
  415.     if (RUN_CALLBACK4(opt, preprocess, &cAddr, &cSize, urladr, urlfil) == 1) {
  416.       r->adr = cAddr;
  417.       r->size = cSize;
  418.     }
  419.   }
  420.   if (RUN_CALLBACK4(opt, check_html, r->adr,(int)r->size,urladr,urlfil)) {
  421.     FILE* fp=NULL;      // fichier Θcrit localement 
  422.     char* adr=r->adr;    // pointeur (on parcourt)
  423.     char* lastsaved;    // adresse du dernier octet sauvΘ + 1
  424.     if ( (opt->debug>1) && (opt->log!=NULL) ) {
  425.       HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"scanning file %s%s (%s).."LF, urladr, urlfil, savename); test_flush;
  426.     }
  427.  
  428.  
  429.     // Indexing!
  430. #if HTS_MAKE_KEYWORD_INDEX
  431.     if (opt->kindex) {
  432.       if (index_keyword(r->adr,r->size,r->contenttype,savename,StringBuff(opt->path_html))) {
  433.         if ( (opt->debug>1) && (opt->log!=NULL) ) {
  434.           HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"indexing file..done"LF); test_flush;
  435.         }
  436.       } else {
  437.         if ( (opt->debug>1) && (opt->log!=NULL) ) {
  438.           HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"indexing file..error!"LF); test_flush;
  439.         }
  440.       }
  441.     }
  442. #endif
  443.  
  444.     // Now, parsing
  445.     if ((opt->getmode & 1) && (ptr>0)) {  // rΘcupΘrer les html sur disque       
  446.       // crΘer le fichier html local
  447.       HT_ADD_FOP;   // Θcrire peu α peu le fichier
  448.     }
  449.  
  450.     if (!error) {
  451.       time_t user_interact_timestamp = 0;
  452.       int detect_title=0;  // dΘtection  du title
  453.       int back_add_stats = opt->state.back_add_stats;
  454.       //
  455.       char* in_media=NULL; // in other media type (real media and so..)
  456.       int intag=0;         // on est dans un tag
  457.       int incomment=0;     // dans un <!--
  458.       int inscript=0;      // dans un scipt pour applets javascript)
  459.       signed char inscript_state[10][257];
  460.       typedef enum { 
  461.         INSCRIPT_START=0,
  462.         INSCRIPT_ANTISLASH,
  463.         INSCRIPT_INQUOTE,
  464.         INSCRIPT_INQUOTE2,
  465.         INSCRIPT_SLASH,
  466.         INSCRIPT_SLASHSLASH,
  467.         INSCRIPT_COMMENT,
  468.         INSCRIPT_COMMENT2,
  469.         INSCRIPT_ANTISLASH_IN_QUOTE,
  470.         INSCRIPT_ANTISLASH_IN_QUOTE2,
  471.         INSCRIPT_DEFAULT=256
  472.       } INSCRIPT;
  473.       INSCRIPT inscript_state_pos=INSCRIPT_START;
  474.       char* inscript_name=NULL; // script tag name
  475.       int inscript_tag=0;  // on est dans un <body onLoad="... terminΘ par >
  476.       char inscript_tag_lastc='\0';
  477.       // terminaison (" ou ') du "<body onLoad=.."
  478.       int inscriptgen=0;     // on est dans un code gΘnΘrant, ex aprΦs obj.write("..
  479.       //int inscript_check_comments=0, inscript_in_comments=0;    // javascript comments
  480.       char scriptgen_q='\0'; // caractΦre faisant office de guillemet (' ou ")
  481.       //int no_esc_utf=0;      // ne pas echapper chars > 127
  482.       int nofollow=0;        // ne pas scanner
  483.       //
  484.       int parseall_lastc='\0';     // dernier caractΦre parsΘ pour parseall
  485.       //int parseall_incomment=0;   // dans un /* */ (exemple: a = /* URL */ "img.gif";)
  486.       //
  487.       char* intag_start = adr;
  488.             char* intag_name = NULL;
  489.       char* intag_startattr=NULL;
  490.       int intag_start_valid=0;
  491.       int intag_ctype=0;
  492.       //
  493.       int   parent_relative=0;    // the parent is the base path (.js, .css..)
  494.       HT_ADD_START;    // dΘbuter
  495.       lastsaved=adr;
  496.  
  497.       /* Initialize script automate for comments, quotes.. */
  498.       memset(inscript_state, 0xff, sizeof(inscript_state));
  499.       inscript_state[INSCRIPT_START][INSCRIPT_DEFAULT]=INSCRIPT_START;     /* by default, stay in START */
  500.       inscript_state[INSCRIPT_START]['\\']=INSCRIPT_ANTISLASH;             /* #1: \ escapes the next character whatever it is */
  501.       inscript_state[INSCRIPT_ANTISLASH][INSCRIPT_DEFAULT]=INSCRIPT_START;
  502.       inscript_state[INSCRIPT_START]['\'']=INSCRIPT_INQUOTE;               /* #2: ' opens quote and only ' returns to 0 */
  503.       inscript_state[INSCRIPT_INQUOTE][INSCRIPT_DEFAULT]=INSCRIPT_INQUOTE;
  504.       inscript_state[INSCRIPT_INQUOTE]['\'']=INSCRIPT_START;
  505.       inscript_state[INSCRIPT_INQUOTE]['\\']=INSCRIPT_ANTISLASH_IN_QUOTE;
  506.       inscript_state[INSCRIPT_START]['\"']=INSCRIPT_INQUOTE2;              /* #3: " opens double-quote and only " returns to 0 */
  507.       inscript_state[INSCRIPT_INQUOTE2][INSCRIPT_DEFAULT]=INSCRIPT_INQUOTE2;
  508.       inscript_state[INSCRIPT_INQUOTE2]['\"']=INSCRIPT_START;
  509.       inscript_state[INSCRIPT_INQUOTE2]['\\']=INSCRIPT_ANTISLASH_IN_QUOTE2;
  510.       inscript_state[INSCRIPT_START]['/']=INSCRIPT_SLASH;                  /* #4: / state, default to #0 */
  511.       inscript_state[INSCRIPT_SLASH][INSCRIPT_DEFAULT]=INSCRIPT_START;
  512.       inscript_state[INSCRIPT_SLASH]['/']=INSCRIPT_SLASHSLASH;             /* #5: // with only LF to escape */
  513.       inscript_state[INSCRIPT_SLASHSLASH][INSCRIPT_DEFAULT]=INSCRIPT_SLASHSLASH;
  514.       inscript_state[INSCRIPT_SLASHSLASH]['\n']=INSCRIPT_START;
  515.       inscript_state[INSCRIPT_SLASH]['*']=INSCRIPT_COMMENT;                /* #6: / * with only * / to escape */
  516.       inscript_state[INSCRIPT_COMMENT][INSCRIPT_DEFAULT]=INSCRIPT_COMMENT;
  517.       inscript_state[INSCRIPT_COMMENT]['*']=INSCRIPT_COMMENT2;             /* #7: closing comments */
  518.       inscript_state[INSCRIPT_COMMENT2][INSCRIPT_DEFAULT]=INSCRIPT_COMMENT;
  519.       inscript_state[INSCRIPT_COMMENT2]['/']=INSCRIPT_START;
  520.       inscript_state[INSCRIPT_COMMENT2]['*']=INSCRIPT_COMMENT2;
  521.       inscript_state[INSCRIPT_ANTISLASH_IN_QUOTE][INSCRIPT_DEFAULT]=INSCRIPT_INQUOTE;    /* #8: escape in "" */
  522.       inscript_state[INSCRIPT_ANTISLASH_IN_QUOTE2][INSCRIPT_DEFAULT]=INSCRIPT_INQUOTE2;  /* #9: escape in '' */
  523.  
  524.       /* Primary list or URLs */
  525.       if (ptr == 0) {
  526.         intag=1;
  527.         intag_start_valid=0;
  528.                 intag_name = NULL;
  529.       }
  530.       /* Check is the file is a .js file */
  531.       else if (
  532.         (compare_mime(opt,r->contenttype, str->url_file, "application/x-javascript")!=0)
  533.         || (compare_mime(opt,r->contenttype, str->url_file, "text/css")!=0)
  534.         ) {      /* JavaScript js file */
  535.           inscript=1;
  536.           if (opt->parsedebug) { HT_ADD("<@@ inscript @@>"); }
  537.           inscript_name="script";
  538.           intag=1;     // because aprΦs <script> on y est .. - pas utile
  539.           intag_start_valid=0;    // OUI car nous sommes dans du code, plus dans du "vrai" tag
  540.           if ((opt->debug>1) && (opt->log!=NULL)) {
  541.             HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"note: this file is a javascript file"LF); test_flush;
  542.           }
  543.           // for javascript only
  544.           if (compare_mime(opt,r->contenttype, str->url_file, "application/x-javascript") != 0) {
  545.             // all links must be checked against parent, not this link
  546.             if (liens[ptr]->precedent != 0) {
  547.               parent_relative=1;
  548.             }
  549.           }
  550.         }
  551.         /* Or a real audio */
  552.       else if (compare_mime(opt,r->contenttype, str->url_file, "audio/x-pn-realaudio")!=0) {      /* realaudio link file */
  553.         inscript=intag=0;
  554.         inscript_name="media";
  555.         intag_start_valid=0;
  556.         in_media="LNK";       // real media! -> links
  557.       } 
  558.       /* Or a m3u playlist */
  559.       else if (compare_mime(opt,r->contenttype, str->url_file, "audio/x-mpegurl")!=0) {      /* mp3 link file */
  560.         inscript=intag=0;
  561.         inscript_name="media";
  562.         intag_start_valid=0;
  563.         in_media="LNK";       // m3u! -> links
  564.       } 
  565.       else if (compare_mime(opt,r->contenttype, str->url_file, "application/x-authorware-map")!=0) {      /* macromedia aam file */
  566.         inscript=intag=0;
  567.         inscript_name="media";
  568.         intag_start_valid=0;
  569.         in_media="AAM";       // aam
  570.       } 
  571.       /* Or a RSS file */
  572.       else if (
  573.         compare_mime(opt,r->contenttype, str->url_file, "text/xml") != 0
  574.         || compare_mime(opt,r->contenttype, str->url_file, "application/xml") != 0
  575.         ) 
  576.       {
  577.         if (strstr(adr, "http://purl.org/rss/") != NULL) // Hmm, this is a bit lame ; will have to cleanup
  578.         {      /* RSS file */
  579.           inscript=intag=0;
  580.           intag_start_valid=0;
  581.           in_media=NULL;       // regular XML
  582.         } else {   // cancel: write all
  583.           adr = r->adr + r->size;
  584.           HT_ADD_ADR;
  585.           lastsaved=adr;
  586.         }
  587.       }
  588.  
  589.       // Detect UTF8 format
  590.       //if (is_unicode_utf8((unsigned char*) r->adr, (unsigned int) r->size) == 1) {
  591.       //  no_esc_utf=1;
  592.       //} else {
  593.       //  no_esc_utf=0;
  594.       //}
  595.  
  596.             // Hack to prevent any problems with ram files of other files
  597.       * ( r->adr + r->size ) = '\0';
  598.  
  599.       // ------------------------------------------------------------
  600.       // analyser ce qu'il y a en mΘmoire (fichier html)
  601.       // on scanne les balises
  602.       // ------------------------------------------------------------
  603.       opt->state._hts_in_html_done=0;     // 0% scannΘs
  604.       opt->state._hts_in_html_parsing=1;  // flag pour indiquer un parsing
  605.  
  606.             base[0]='\0';    // effacer base-href
  607.       do {
  608.         int p=0;
  609.         int valid_p=0;      // force to take p even if == 0
  610.         int ending_p='\0';  // ending quote?
  611.         int archivetag_p=0;  // avoid multiple-archives with commas
  612.         int  unquoted_script=0;
  613.         INSCRIPT inscript_state_pos_prev=inscript_state_pos;
  614.         error=0;
  615.  
  616.         /* Break if we are done yet */
  617.         if ( ( adr - r->adr ) >= r->size)
  618.           break;
  619.  
  620.         /* Hack to avoid NULL char problems with C syntax */
  621.         /* Yes, some bogus HTML pages can embed null chars
  622.         and therefore can not be properly handled if this hack is not done
  623.         */
  624.         if ( ! (*adr) ) {
  625.           if ( ((int) (adr - r->adr)) < r->size)
  626.             *adr=' ';
  627.         }
  628.  
  629.         /*
  630.         index.html built here
  631.         */
  632.         // Construction index.html (sommaire)
  633.         // Avant de tester les a href,
  634.         // Ici on teste si l'on doit construire l'index vers le(s) site(s) miroir(s)
  635.         if (!makeindex_done) {  // autoriation d'Θcrire un index
  636.           if (!detect_title) {
  637.             if (opt->depth == liens[ptr]->depth) {    // on note toujours les premiers liens
  638.               if (!in_media) {
  639.                 if (opt->makeindex && (ptr>0)) {
  640.                   if (opt->getmode & 1) {  // autorisation d'Θcrire
  641.                     p=strfield(adr,"title");  
  642.                     if (p) {
  643.                       if (*(adr-1)=='/') p=0;    // /title
  644.                     } else {
  645.                       if (strfield(adr,"/html"))
  646.                         p=-1;                    // noter, mais sans titre
  647.                       else if (strfield(adr,"body"))
  648.                         p=-1;                    // noter, mais sans titre
  649.                       else if ( ((int) (adr - r->adr) ) >= (r->size-1) )
  650.                         p=-1;                    // noter, mais sans titre
  651.                       else if ( (int) (adr - r->adr) >= r->size - 2)   // we got to hurry
  652.                         p=-1; // xxc xxc xxc
  653.                     }
  654.                   } else
  655.                     p=0;
  656.  
  657.                   if (p) {    // ok center                            
  658.                     if (makeindex_fp==NULL) {
  659.                       file_notify(opt,"", "", fconcat(OPT_GET_BUFF(opt), StringBuff(opt->path_html),"index.html"), 1, 1, 0);
  660.                       verif_backblue(opt,StringBuff(opt->path_html));    // gΘnΘrer gif
  661.                       makeindex_fp=filecreate(&opt->state.strc, fconcat(OPT_GET_BUFF(opt), StringBuff(opt->path_html),"index.html"));
  662.                       if (makeindex_fp!=NULL) {
  663.  
  664.                         // Header
  665.                         fprintf(makeindex_fp,template_header,
  666.                           "<!-- Mirror and index made by HTTrack Website Copier/"HTTRACK_VERSION" "HTTRACK_AFF_AUTHORS" -->"
  667.                           );
  668.  
  669.                       } else makeindex_done=-1;    // fait, erreur
  670.                     }
  671.  
  672.                     if (makeindex_fp!=NULL) {
  673.                       char BIGSTK tempo[HTS_URLMAXSIZE*2];
  674.                       char BIGSTK s[HTS_URLMAXSIZE*2];
  675.                       char* a=NULL;
  676.                       char* b=NULL;
  677.                       s[0]='\0';
  678.                       if (p>0) {
  679.                         a=strchr(adr,'>');
  680.                         if (a!=NULL) {
  681.                           a++;
  682.                           while(is_space(*a)) a++;    // sauter espaces & co
  683.                           b=strchr(a,'<');   // prochain tag
  684.                         }
  685.                       }
  686.                       if (lienrelatif(tempo,liens[ptr]->sav,concat(OPT_GET_BUFF(opt),StringBuff(opt->path_html),"index.html"))==0) {
  687.                         detect_title=1;      // ok dΘtectΘ pour cette page!
  688.                         makeindex_links++;   // un de plus
  689.                         strcpybuff(makeindex_firstlink,tempo);
  690.                         //
  691.  
  692.                         /* Hack */
  693.                         if (opt->mimehtml) {
  694.                           strcpybuff(makeindex_firstlink, "cid:primary/primary");
  695.                         }
  696.  
  697.                         if ((b==a) || (a==NULL) || (b==NULL)) {    // pas de titre
  698.                           strcpybuff(s,tempo);
  699.                         } else if ((b-a)<256) {
  700.                           b--;
  701.                           while(is_space(*b)) b--;
  702.                           strncpy(s,a,b-a+1);
  703.                           *(s+(b-a)+1)='\0';
  704.                         }
  705.  
  706.                         // Body
  707.                         fprintf(makeindex_fp,template_body,
  708.                           tempo,
  709.                           s
  710.                           );
  711.  
  712.                       }
  713.                     }
  714.                   }
  715.                 }
  716.               }
  717.  
  718.             } else if (liens[ptr]->depth<opt->depth) {   // on a sautΘ level1+1 et level1
  719.               HT_INDEX_END;
  720.             }
  721.           } // if (opt->makeindex)
  722.         }
  723.         // FIN Construction index.html (sommaire)
  724.         /*
  725.         end -- index.html built here
  726.         */
  727.  
  728.  
  729.  
  730.         /* Parse */
  731.         if (
  732.           (*adr=='<')    /* No starting tag */
  733.           && (!inscript)    /* Not in (java)script */
  734.           && (!incomment)   /* Not in comment (<!--) */
  735.           && (!in_media)    /* Not in media */
  736.           ) { 
  737.             intag=1;
  738.                         intag_ctype=0;
  739.                         //parseall_incomment=0;
  740.                         //inquote=0;  // effacer quote
  741.                         intag_start = adr;
  742.                         for(intag_name = adr + 1 ; is_realspace(*intag_name) ; intag_name++ );
  743.                         intag_start_valid = 1;
  744.                         codebase[0]='\0';    // effacer Θventuel codebase
  745.  
  746.                         /* Meta ? */
  747.                         if (check_tag(intag_start, "meta")) {
  748.                             int pos;
  749.                             // <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
  750.                             if ((pos = rech_tageq_all(adr, "http-equiv"))) {
  751.                                 const char* token = NULL;
  752.                                 int len = rech_endtoken(adr + pos, &token);
  753.                                 if (len > 0) {
  754.                                     if (strfield(token, "content-type")) {
  755.                                         intag_ctype=1;
  756.                                     }
  757.                                     else if (strfield(token, "refresh")) {
  758.                                         intag_ctype=2;
  759.                                     }
  760.                                 }
  761.                             }
  762.                         }
  763.  
  764.             if (opt->getmode & 1) {  // sauver html
  765.               p=strfield(adr,"</html");
  766.               if (p==0) p=strfield(adr,"<head>");
  767.               // if (p==0) p=strfield(adr,"<doctype");
  768.               if (p) {
  769.                 char* eol="\n";
  770.                 if (strchr(r->adr,'\r'))
  771.                   eol="\r\n";
  772.                 if (StringNotEmpty(opt->footer) || opt->urlmode != 4) {  /* != preserve */
  773.                                     if (StringNotEmpty(opt->footer)) {
  774.                       char BIGSTK tempo[1024+HTS_URLMAXSIZE*2];
  775.                                         char gmttime[256];
  776.                                         tempo[0]='\0';
  777.                                         time_gmt_rfc822(gmttime);
  778.                                         strcatbuff(tempo,eol);
  779.                       sprintf(tempo+strlen(tempo),StringBuff(opt->footer),jump_identification(urladr),urlfil,gmttime,HTTRACK_VERSIONID,"","","","","","","");
  780.                         strcatbuff(tempo,eol);
  781.                                         //fwrite(tempo,1,strlen(tempo),fp);
  782.                                         HT_ADD(tempo);
  783.                                     }
  784.                   if (strnotempty(r->charset)) {
  785.                     HT_ADD("<!-- Added by HTTrack --><meta http-equiv=\"content-type\" content=\"text/html;charset=");
  786.                     HT_ADD(r->charset);
  787.                     HT_ADD("\"><!-- /Added by HTTrack -->");
  788.                     HT_ADD(eol);
  789.                   }
  790.                 }
  791.               }
  792.             }        
  793.  
  794.             // skip <!-- (comments): intag is invalidated
  795.             if (*(adr+1)=='!')
  796.               if (*(adr+2)=='-')
  797.                 if (*(adr+3)=='-') {
  798.                   intag=0;
  799.                   incomment=1;
  800.                   intag_start_valid=0;
  801.                 }
  802.  
  803.           }
  804.         else if (
  805.           (*adr=='>')                        /* ending tag */
  806.           && ( (!inscript && !in_media) || (inscript_tag) )  /* and in tag (or in script) */
  807.           ) {
  808.             if (inscript_tag) {
  809.               inscript_tag=inscript=0;
  810.               intag=0;
  811.               incomment=0;
  812.               intag_start_valid=0;
  813.                             intag_name = NULL;
  814.               if (opt->parsedebug) { HT_ADD("<@@ /inscript @@>"); }
  815.             } else if (!incomment) {
  816.               intag=0; //inquote=0;
  817.  
  818.               // entering javascript?
  819.               // we parse HERE because we may have had to parse the src=.. inside the tag
  820.               //if (!inscript) {  // sinon on est dans un obj.write("..
  821.               if ((intag_start_valid) && 
  822.                 (
  823.                 check_tag(intag_start,"script")
  824.                 ||
  825.                 check_tag(intag_start,"style")
  826.                 )
  827.                 ) {
  828.                   char* a=intag_start;    // <
  829.                   // ** while(is_realspace(*(--a)));
  830.                   if (*a=='<') {  // s√r que c'est un tag?
  831.                     if (check_tag(intag_start,"script"))
  832.                       inscript_name="script";
  833.                     else
  834.                       inscript_name="style";
  835.                     inscript=1;
  836.                     inscript_state_pos=INSCRIPT_START;
  837.                     intag=1;     // because aprΦs <script> on y est .. - pas utile
  838.                     intag_start_valid=0;    // OUI car nous sommes dans du code, plus dans du "vrai" tag
  839.                     if (opt->parsedebug) { HT_ADD("<@@ inscript @@>"); }
  840.                   }
  841.                 }
  842.             } else {                               /* end of comment? */
  843.               // check that the comment is correctly closed
  844.               if ( (*(adr-1)=='-') && (*(adr-2)=='-') ) {
  845.                 intag=0;
  846.                 incomment=0;
  847.                 intag_start_valid=0;
  848.                                 intag_name = NULL;
  849.               }
  850. #if GT_ENDS_COMMENT
  851.               /* wrong comment ending */
  852.               else {
  853.                 /* check if correct ending does not exists
  854.                 <!-- foo > example <!-- bar > is sometimes accepted by browsers
  855.                 when no --> is used somewhere else.. darn those browsers are dirty
  856.                 */
  857.                 if (!strstr(adr,"-->")) {
  858.                   intag=0;
  859.                   incomment=0;
  860.                   intag_start_valid=0;
  861.                                     intag_name = NULL;
  862.                 }
  863.               }
  864. #endif
  865.             }
  866.             //}
  867.           }
  868.           //else if (*adr==34) {
  869.           //  inquote=(inquote?0:1);
  870.           //}
  871.         else if (intag || inscript || in_media) {    // nous sommes dans un tag/commentaire, tester si on recoit un tag
  872.           int p_type=0;
  873.           int p_nocatch=0;
  874.           int p_searchMETAURL=0;  // chercher ..URL=<url>
  875.           int add_class=0;        // ajouter .class
  876.           int add_class_dots_to_patch=0;   // number of '.' in code="x.y.z<realname>"
  877.           char* p_flush=NULL;
  878.  
  879.  
  880.           // ------------------------------------------------------------
  881.           // advanced parsing
  882.           // ------------------------------------------------------------
  883.           if (((isalpha((unsigned char)*adr)) || (*adr=='/') || (inscript) || (in_media) || (inscriptgen))) {  // sinon pas la peine de tester..
  884.  
  885.  
  886.             /* terminating character for javascript=.. "miniparsing"?
  887.             (ex: <a href="javascript:()" action="foo"> ) */
  888.             if (inscript_tag) {
  889.               if (inscript_tag_lastc) {
  890.                 if (*adr == inscript_tag_lastc) {
  891.                   /* sortir */
  892.                   inscript_tag=inscript=0;
  893.                   incomment=0;
  894.                   if (opt->parsedebug) { HT_ADD("<@@ /inscript @@>"); }
  895.                 }
  896.               }
  897.             }
  898.  
  899.             /* automate */
  900.             AUTOMATE_LOOKUP_CURRENT_ADR();
  901.  
  902.  
  903.             // Note:
  904.             // Some pages do not follow the HTML rules;
  905.             // in particular, quotes are not used consistently.
  906.             // We are inside a tag, so we can use a looser
  907.             // test in order to cope with these quirks.
  908.  
  909.             // to be checked: ACTION, CODEBASE, VRML
  910.  
  911.             if (in_media) {
  912.               if (strcmp(in_media,"LNK")==0) { // real media
  913.                 p=0;
  914.                 valid_p=1;
  915.               }
  916.               else if (strcmp(in_media, "AAM")==0) { // AAM
  917.                 if (is_space((unsigned char)adr[0]) && ! is_space((unsigned char)adr[1])) {
  918.                   char* a = adr + 1;
  919.                   int n = 0;
  920.                   int ok = 0;
  921.                   int dot = 0;
  922.                   while(n < HTS_URLMAXSIZE/2 && a[n] != '\0' &&
  923.                     ( ! is_space((unsigned char)a[n]) || ! ( ok = 1) )
  924.                     ) {
  925.                       if (a[n] == '.') {
  926.                         dot = n;
  927.                       }
  928.                       n++;
  929.                     }
  930.                     if (ok && dot > 0) {
  931.                       char BIGSTK tmp[HTS_URLMAXSIZE/2 + 2];
  932.                       tmp[0] = '\0';
  933.                       strncat(tmp, a + dot + 1, n - dot - 1);
  934.                       if (is_knowntype(opt,tmp) || ishtml_ext(tmp) != -1) {
  935.                         adr++;
  936.                         p = 0;
  937.                         valid_p = 1;
  938.                         unquoted_script = 1;
  939.                       }
  940.                     }
  941.                 }
  942.               }
  943.             } else if (ptr>0) {        /* pas premiΦre page 0 (primary) */
  944.               p=0;  // saut pour le nom de fichier: adresse nom fichier=adr+p
  945.  
  946.               // ------------------------------
  947.               // detection of content written by JavaScript.
  948.               // let's dare to handle obj.write and obj.href=.. !
  949.               // note: inscript==1, so we will skip past the \"
  950.               if (inscript) {
  951.                 if (inscriptgen) {          // on est dΘja dans un objet gΘnΘrant..
  952.                   if (*adr==scriptgen_q) {  // fermeture des " ou '
  953.                     if (*(adr-1)!='\\') {   // non
  954.                       inscriptgen=0;        // ok parsing terminΘ
  955.                     }
  956.                   }
  957.                 } else {
  958.                   char* a=NULL;
  959.                   char check_this_fking_line=0;  // parsing code javascript..
  960.                   char must_be_terminated=0;     // caractΦre obligatoire de terminaison!
  961.                   int token_size;
  962.                   if (!(token_size=strfield(adr,".writeln"))) // dΘtection ...objet.write[ln]("code html")...
  963.                     token_size=strfield(adr,".write");
  964.                   if (token_size) {
  965.                     a=adr+token_size;
  966.                     while(is_realspace(*a)) a++; // sauter espaces
  967.                     if (*a=='(') {  // dΘbut parenthΦse
  968.                       check_this_fking_line=2;  // α parser!
  969.                       must_be_terminated=')';
  970.                       a++;  // sauter (
  971.                     }
  972.                   }
  973.                   // euhh ??? ???
  974.                   /* else if (strfield(adr,".href")) {  // dΘtection ...objet.href="...
  975.                   a=adr+5;
  976.                   while(is_realspace(*a)) a++; // sauter espaces
  977.                   if (*a=='=') {  // ohh un Θgal
  978.                   check_this_fking_line=1;  // α noter!
  979.                   must_be_terminated=';';   // et si t'as oubliΘ le ; tu sais pas coder
  980.                   a++;   // sauter =
  981.                   }
  982.  
  983.                   }*/
  984.  
  985.                   // on a un truc du genre instruction"code gΘnΘrΘ" dont on parse le code
  986.                   if (check_this_fking_line) {
  987.                     while(is_realspace(*a)) a++;
  988.                     if ((*a=='\'') || (*a=='"')) {  // dΘpart de '' ou ""
  989.                       char *b;
  990.                       scriptgen_q=*a;    // quote
  991.                       b=a+1;      // dΘpart de la chaεne
  992.                       // vΘrifier forme ("code") et pas ("code"+var), ingΘrable
  993.                       do {
  994.                         if (*a==scriptgen_q && *(a-1)!='\\')  // quote non slash
  995.                           break;            // sortie
  996.                         else if (*a==10 && *(a-1) != '\\'  /* LF and no continue (\) character */
  997.                           && ( *(a-1) != '\r' || *(a-2) != '\\' ) )  /* and not CRLF and no .. */
  998.                           break;
  999.                         else 
  1000.                           a++;  // caractΦre suivant
  1001.                       } while((a-b) < HTS_URLMAXSIZE / 2);
  1002.                       if (*a==scriptgen_q) {  // fin du quote
  1003.                         a++;
  1004.                         while(is_realspace(*a)) a++;
  1005.                         if (*a==must_be_terminated) {  // parenthΦse fermante: ("..")
  1006.  
  1007.                           // ok, we have to parse a javascript line
  1008.                           // 1) if check.. ==1 then it is a direct filename, so
  1009.                           // we set p to the jump needed to reach the filename
  1010.                           // and the engine will then manage entirely on its own
  1011.                           // 2) if check==2 it is a bit more twisted, because here we generate
  1012.                           // html code inside javascript code inside html code;
  1013.                           // in this case we must set a flag to one, and then in the loop
  1014.                           // we will have to parse standard instructions such as <a href etc
  1015.                           // NOTE: auto-generated javascript code is not taken into account!!
  1016.                           // (and does not work in 50% of cases anyway!)
  1017.                           if (check_this_fking_line==1) {
  1018.                             p=(int) (b - adr);    // calculer saut!
  1019.                           } else {
  1020.                             inscriptgen=1;        // SCRIPTGEN actif
  1021.                             adr=b;                // jump
  1022.                           }
  1023.  
  1024.                           if ((opt->debug>1) && (opt->log!=NULL)) {
  1025.                             char str[512];
  1026.                             str[0]='\0';
  1027.                             strncatbuff(str,b,minimum((int) (a - b + 1), 32));
  1028.                             HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"active code (%s) detected in javascript: %s"LF,(check_this_fking_line==2)?"parse":"pickup",str); test_flush;
  1029.                           }
  1030.                         }
  1031.  
  1032.                       }
  1033.  
  1034.                     }
  1035.  
  1036.  
  1037.                   }
  1038.                 }
  1039.               }
  1040.               // end of detection of javascript code generating html
  1041.               // ------------------------------
  1042.  
  1043.  
  1044.               // analyse proprement dite, A HREF=.. etc..
  1045.               if (!p) {
  1046.                 // si dans un tag, et pas dans un script - sauf si on analyse un obj.write("..
  1047.                 if ((intag && (!inscript)) || inscriptgen) {
  1048.                   if ( (*(adr-1)=='<') || (is_space(*(adr-1))) ) {   // <tag < tag etc
  1049.                     // <A HREF=.. pour les liens HTML
  1050.                     p=rech_tageq(adr,"href");
  1051.                     if (p) {    // href.. tester si c'est une bas href!
  1052.                       if ((intag_start_valid) && check_tag(intag_start, "base")) {  // oui!
  1053.                         // ** note: base href et codebase ne font pas bon mΘnage..
  1054.                         p_type=2;    // c'est un chemin
  1055.                       }
  1056.                     }
  1057.  
  1058.                     /* Additional tags to check (<img src=..> etc) */
  1059.                     if (p==0) {
  1060.                       int i=0;
  1061.                       while( (p==0) && (strnotempty(hts_detect[i])) ) {
  1062.                         p=rech_tageq(adr,hts_detect[i]);
  1063.                         if (p) {
  1064.                           /* This is a temporary hack to avoid archive=foo.jar,bar.jar .. */
  1065.                           if (strcmp(hts_detect[i], "archive") == 0) {
  1066.                             archivetag_p = 1;
  1067.                           }
  1068.                         }
  1069.                         i++;
  1070.                       }
  1071.                     }
  1072.  
  1073.                     /* Additional tags to check, matched by their beginning (<object .. hotspot1=..> etc) */
  1074.                     if (p==0) {
  1075.                       int i=0;
  1076.                       while( (p==0) && (strnotempty(hts_detectbeg[i])) ) {
  1077.                         p=rech_tageqbegdigits(adr,hts_detectbeg[i]);
  1078.                         i++;
  1079.                       }
  1080.                     }
  1081.  
  1082.                     /* Additional tags to check: URL=.. */
  1083.                     if (p==0) {
  1084.                       int i=0;
  1085.                       while( (p==0) && (strnotempty(hts_detectURL[i])) ) {
  1086.                         p=rech_tageq(adr,hts_detectURL[i]);
  1087.                         i++;
  1088.                       }
  1089.                       if (p) {
  1090.                         if (intag_ctype == 1) {
  1091.                           p = 0;
  1092. #if 0
  1093.                           //if ((pos=rech_tageq(adr, "content"))) {
  1094.                           char temp[256];
  1095.                           char* token = NULL;
  1096.                           int len = rech_endtoken(adr + pos, &token);
  1097.                           if (len > 0 && len < sizeof(temp) - 2) {
  1098.                             char* chpos;
  1099.                             temp[0] = '\0';
  1100.                             strncat(temp, token, len);
  1101.                             if ((chpos = strstr(temp, "charset"))
  1102.                               &&
  1103.                               (chpos = strchr(chpos, '='))
  1104.                               ) {
  1105.                                 chpos++;
  1106.                                 while(is_space(*chpos)) chpod++;
  1107.                                 chpos
  1108.                               }
  1109.                           }
  1110. #endif
  1111.                         }
  1112.                         // <META HTTP-EQUIV="Refresh" CONTENT="3;URL=http://www.example.com">
  1113.                         else if (intag_ctype == 2) {
  1114.                           p_searchMETAURL=1;
  1115.                         } else {
  1116.                           p = 0;            /* cancel */
  1117.                         }
  1118.                       }
  1119.  
  1120.  
  1121.                     }
  1122.  
  1123.                     /* Additional tags to check, but not to capture */
  1124.                     if (p==0) {
  1125.                       int i=0;
  1126.                       while( (p==0) && (strnotempty(hts_detectandleave[i])) ) {
  1127.                         p=rech_tageq(adr,hts_detectandleave[i]);
  1128.                         i++;
  1129.                       }
  1130.                       if (p)
  1131.                         p_nocatch=1;      /* ne pas rechercher */
  1132.                     }
  1133.  
  1134.                     /* EvΘnements */
  1135.                     if (p==0 && 
  1136.                       ! inscript          /* we don't want events inside document.write */
  1137.                       ) {
  1138.                         int i=0;
  1139.                         /* dΘtection onLoad etc */
  1140.                         while( (p==0) && (strnotempty(hts_detect_js[i])) ) {
  1141.                           p=rech_tageq(adr,hts_detect_js[i]);
  1142.                           i++;
  1143.                         }
  1144.                         /* non dΘtectΘ - dΘtecter Θgalement les onXxxxx= */
  1145.                         if (p==0) {
  1146.                           if ( (*adr=='o') && (*(adr+1)=='n') && isUpperLetter(*(adr+2)) ) {
  1147.                             p=0;
  1148.                             while(isalpha((unsigned char)adr[p]) && (p<64) ) p++;
  1149.                             if (p<64) {
  1150.                               while(is_space(adr[p])) p++;
  1151.                               if (adr[p]=='=')
  1152.                                 p++;
  1153.                               else p=0;
  1154.                             } else p=0;
  1155.                           }
  1156.                         }
  1157.                         /* OK, ΘvΘnement repΘrΘ */
  1158.                         if (p) {
  1159.                           inscript_tag_lastc=*(adr+p);     /* α attendre α la fin */
  1160.                           adr+=p+1;   /* saut */
  1161.                           /*
  1162.                           On est dΘsormais dans du code javascript
  1163.                           */
  1164.                           inscript_name="";
  1165.                           inscript=inscript_tag=1;
  1166.                           inscript_state_pos=INSCRIPT_START;
  1167.                           if (opt->parsedebug) { HT_ADD("<@@ inscript @@>"); }
  1168.                         }
  1169.                         p=0;        /* quoi qu'il arrive, ne rien dΘmarrer ici */
  1170.                       }
  1171.  
  1172.                       // <APPLET CODE=.. pour les applet java.. [CODEBASE (chemin..) α faire]
  1173.                       if (p==0) {
  1174.                         p=rech_tageq(adr,"code");
  1175.                         if (p) {
  1176.                           if ((intag_start_valid) && check_tag(intag_start,"applet")) {  // dans un <applet !
  1177.                             p_type=-1;  // juste le nom de fichier+dossier, Θcire avant codebase 
  1178.                             add_class=1;   // ajouter .class au besoin                         
  1179.  
  1180.                             // check that there is no codebase AFTERWARDS,
  1181.                             // otherwise we swap the two.
  1182.                             // not very clean, but it is the simplest thing to do!!
  1183.  
  1184.                             {
  1185.                               char *a;
  1186.                               a=adr;
  1187.                               while((*a) && (*a!='>') && (!rech_tageq(a,"codebase"))) a++;
  1188.                               if (rech_tageq(a,"codebase")) {  // banzai! codebase=
  1189.                                 char* b;
  1190.                                 b=strchr(a,'>');
  1191.                                 if (b) {
  1192.                                   if (((int) (b - adr)) < 1000) {    // au total < 1Ko
  1193.                                     char BIGSTK tempo[HTS_URLMAXSIZE*2];
  1194.                                     tempo[0]='\0';
  1195.                                     strncatbuff(tempo,a,(int) (b - a) );
  1196.                                     strcatbuff( tempo," ");
  1197.                                     strncatbuff(tempo,adr,(int) (a - adr - 1));
  1198.                                     // Θventuellement remplire par des espaces pour avoir juste la taille
  1199.                                     while((int) strlen(tempo)<((int) (b - adr)))
  1200.                                       strcatbuff(tempo," ");
  1201.                                     // pas d'erreur?
  1202.                                     if ((int) strlen(tempo) == ((int) (b - adr) )) {
  1203.                                       strncpy(adr,tempo,strlen(tempo));   // PAS d'octet nul α la fin!
  1204.                                       p=0;    // DEVALIDER!!
  1205.                                       p_type=0;
  1206.                                       add_class=0;
  1207.                                     }
  1208.                                   }
  1209.                                 }
  1210.                               }
  1211.                             }
  1212.  
  1213.                           }
  1214.                         }
  1215.                       }
  1216.  
  1217.                       // liens α patcher mais pas α charger (ex: codebase)
  1218.                       if (p==0) {  // note: si non chargΘ (ex: ignorer .class) patchΘ tout de mΩme
  1219.                         p=rech_tageq(adr,"codebase");
  1220.                         if (p) {
  1221.                           if ((intag_start_valid) && check_tag(intag_start,"applet")) {  // dans un <applet !
  1222.                             p_type=-2;
  1223.                           } else p=-1;   // ne plus chercher
  1224.                         }
  1225.                       }
  1226.  
  1227.  
  1228.                       // Meta tags pour robots
  1229.                       if (p==0) {
  1230.                         if (opt->robots) {
  1231.                           if ((intag_start_valid) && check_tag(intag_start,"meta")) {
  1232.                             if (rech_tageq(adr,"name")) {    // name=robots.txt
  1233.                               char tempo[1100];
  1234.                               char* a;
  1235.                               tempo[0]='\0';
  1236.                               a=strchr(adr,'>');
  1237. #if DEBUG_ROBOTS
  1238.                               printf("robots.txt meta tag detected\n");
  1239. #endif
  1240.                               if (a) {
  1241.                                 if (((int) (a - adr)) < 999 ) {
  1242.                                   strncatbuff(tempo,adr,(int) (a - adr));
  1243.                                   if (strstrcase(tempo,"content")) {
  1244.                                     if (strstrcase(tempo,"robots")) {
  1245.                                       if (strstrcase(tempo,"nofollow")) {
  1246. #if DEBUG_ROBOTS
  1247.                                         printf("robots.txt meta tag: nofollow in %s%s\n",urladr,urlfil);
  1248. #endif
  1249.                                         nofollow=1;       // NE PLUS suivre liens dans cette page
  1250.                                         if (opt->log) {
  1251.                                           HTS_LOG(opt,LOG_WARNING); fprintf(opt->log,"Link %s%s not scanned (follow robots meta tag)"LF,urladr,urlfil);
  1252.                                           test_flush;
  1253.                                         }
  1254.                                       }
  1255.                                     }
  1256.                                   }
  1257.                                 }
  1258.                               }
  1259.                             }
  1260.                           }
  1261.                         }
  1262.                       }
  1263.  
  1264.                                             // entrΘe dans une applet javascript
  1265.                       /*if (!inscript) {  // sinon on est dans un obj.write("..
  1266.                       if (p==0)
  1267.                       if (rech_sampletag(adr,"script"))
  1268.                       if (check_tag(intag_start,"script")) {
  1269.                       inscript=1;
  1270.                       }
  1271.                       }*/
  1272.  
  1273.                       // Here we analyze the javascript code to try to pick up
  1274.                       // some obvious files.
  1275.                       // This has become mandatory given the number of pages that embed
  1276.                       // rollover images, for example
  1277.                   }
  1278.                 } else if (inscript) {
  1279.  
  1280. #if 0
  1281.                   /* Check // javascript comments */
  1282.                   if (*adr == 10 || *adr == 13) {
  1283.                     inscript_check_comments = 1;
  1284.                     inscript_in_comments = 0;
  1285.                   }
  1286.                   else if (inscript_check_comments) {
  1287.                     if (!is_realspace(*adr)) {
  1288.                       inscript_check_comments = 0;
  1289.                       if (adr[0] == '/' && adr[1] == '/') {
  1290.                         inscript_in_comments = 1;
  1291.                       }
  1292.                     }
  1293.                   }
  1294. #endif
  1295.  
  1296.                   /* Parse */
  1297.                   assertf(inscript_name != NULL);
  1298.                   if (
  1299.                     *adr == '/' &&
  1300.                     (
  1301.                     (strfield(adr,"/script") && strfield(inscript_name, "script"))
  1302.                     ||
  1303.                     (strfield(adr,"/style")  && strfield(inscript_name, "style"))
  1304.                     )
  1305.                     ) {
  1306.                       char* a=adr;
  1307.                       //while(is_realspace(*(--a)));
  1308.                       while( is_realspace(*a) ) a--;
  1309.                       a--;
  1310.                       if (*a=='<') {  // s√r que c'est un tag?
  1311.                         inscript=0;
  1312.                         if (opt->parsedebug) { HT_ADD("<@@ /inscript @@>"); }
  1313.                       }
  1314.                     } else if (inscript_state_pos == INSCRIPT_START /*!inscript_in_comments*/) {
  1315.                       /*
  1316.                       Script Analyzing - different types supported:
  1317.                       foo="url"
  1318.                       foo("url") or foo(url)
  1319.                       foo "url"
  1320.                       */
  1321.                       char  expected     = '=';          // caractΦre attendu aprΦs
  1322.                       char* expected_end = ";";
  1323.                       int can_avoid_quotes=0;
  1324.                       char quotes_replacement='\0';
  1325.                       int ensure_not_mime=0;
  1326.                       if (inscript_tag)
  1327.                         expected_end=";\"\'";            // voir a href="javascript:doc.location='foo'"
  1328.  
  1329.                       /* Can we parse javascript ? */
  1330.                       if ( (opt->parsejava & HTSPARSE_NO_JAVASCRIPT) == 0) {
  1331.                         int nc;
  1332.                         nc = strfield(adr,".src");  // nom.src="image";
  1333.                         if (!nc) nc = strfield(adr,".location");  // document.location="doc"
  1334.                         if (!nc) nc = strfield(adr,":location");  // javascript:location="doc"
  1335.                         if (!nc) { // location="doc"
  1336.                           if ( ( nc = strfield(adr,"location") ) 
  1337.                             && !isspace(*(adr - 1))
  1338.                             )
  1339.                             nc = 0;
  1340.                         }
  1341.                         if (!nc) nc = strfield(adr,".href");  // document.location="doc"
  1342.                         if (!nc) if ( (nc = strfield(adr,".open")) ) { // window.open("doc",..
  1343.                           expected='(';    // parenthΦse
  1344.                           expected_end="),";  // fin: virgule ou parenthΦse
  1345.                           ensure_not_mime=1;  //* ensure the url is not a mime type */
  1346.                         }
  1347.                         if (!nc) if ( (nc = strfield(adr,".replace")) ) { // window.replace("url")
  1348.                           expected='(';    // parenthΦse
  1349.                           expected_end=")";  // fin: parenthΦse
  1350.                         }
  1351.                         if (!nc) if ( (nc = strfield(adr,".link")) ) { // window.link("url")
  1352.                           expected='(';    // parenthΦse
  1353.                           expected_end=")";  // fin: parenthΦse
  1354.                         }
  1355.                         if (!nc && (nc = strfield(adr,"url")) && (!isalnum(*(adr - 1))) && *(adr - 1) != '_') { // url(url)
  1356.                           expected='(';    // parenthΦse
  1357.                           expected_end=")";  // fin: parenthΦse
  1358.                           can_avoid_quotes=1;
  1359.                           quotes_replacement=')';
  1360.                         } else {
  1361.                           nc = 0;
  1362.                         }
  1363.                         if (!nc) if ( (nc = strfield(adr,"import")) ) { // import "url"
  1364.                           if (is_space(*(adr+nc))) {
  1365.                             expected=0;    // no char expected
  1366.                           } else
  1367.                             nc=0;
  1368.                         }
  1369.                         if (nc) {
  1370.                           char *a;
  1371.                           a=adr+nc;
  1372.                           while(is_realspace(*a)) a++;
  1373.                           if ((*a == expected) || (!expected)) {
  1374.                             if (expected)
  1375.                               a++;
  1376.                             while(is_realspace(*a)) a++;
  1377.                             if ((*a==34) || (*a=='\'') || (can_avoid_quotes)) {
  1378.                               char *b,*c;
  1379.                               int ndelim=1;
  1380.                               if ((*a==34) || (*a=='\''))
  1381.                                 a++;
  1382.                               else
  1383.                                 ndelim=0;
  1384.                               b=a;
  1385.                               if (ndelim) {
  1386.                                 while((*b!=34) && (*b!='\'') && (*b!='\0')) b++;
  1387.                               }
  1388.                               else {
  1389.                                 while((*b != quotes_replacement) && (*b!='\0')) b++;
  1390.                               }
  1391.                               c=b--; c+=ndelim;
  1392.                               while(*c==' ') c++;
  1393.                               if ((strchr(expected_end,*c)) || (*c=='\n') || (*c=='\r')) {
  1394.                                 c-=(ndelim+1);
  1395.                                 if ((int) (c - a + 1)) {
  1396.                                   if (ensure_not_mime) {
  1397.                                     int i = 0;
  1398.                                     while(a != NULL && hts_main_mime[i] != NULL && hts_main_mime[i][0] != '\0') {
  1399.                                       int p;
  1400.                                       if ((p=strfield(a, hts_main_mime[i])) && a[p] == '/') {
  1401.                                         a=NULL;
  1402.                                       }
  1403.                                       i++;
  1404.                                     }
  1405.                                   }
  1406.                                   if (a != NULL) {
  1407.                                     if ((opt->debug>1) && (opt->log!=NULL)) {
  1408.                                       char str[512];
  1409.                                       str[0]='\0';
  1410.                                       strncatbuff(str,a,minimum((int) (c - a + 1),32));
  1411.                                       HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"link detected in javascript: %s"LF,str); test_flush;
  1412.                                     }
  1413.                                     p=(int) (a - adr);    // p non nul: TRAITER CHAINE COMME FICHIER
  1414.                                     if (can_avoid_quotes) {
  1415.                                       ending_p=quotes_replacement;
  1416.                                     }
  1417.                                   }
  1418.                                 }
  1419.                               }
  1420.  
  1421.  
  1422.                             }
  1423.                           }
  1424.                         }
  1425.  
  1426.                       }  /* HTSPARSE_NO_JAVASCRIPT */
  1427.  
  1428.                     }
  1429.                 }
  1430.               }
  1431.  
  1432.             } else {      // ptr == 0
  1433.               //p=rech_tageq(adr,"primary");    // lien primaire, yeah
  1434.               p=0;          // No stupid tag anymore, raw link
  1435.               valid_p=1;    // Valid even if p==0
  1436.               while ((adr[p] == '\r') || (adr[p] == '\n'))
  1437.                 p++;
  1438.               //can_avoid_quotes=1;
  1439.               ending_p='\r';
  1440.             }       
  1441.  
  1442.           } else if (isspace((unsigned char)*adr)) {
  1443.             intag_startattr=adr+1;        // attribute in tag (for dirty parsing)
  1444.           }
  1445.  
  1446.  
  1447.           // ------------------------------------------------------------
  1448.           // dernier recours - parsing "sale" : détection systématique des .gif, etc.
  1449.           // risque: générer de faux fichiers parasites
  1450.           // fix: ne parse plus dans les commentaires
  1451.           // ------------------------------------------------------------
  1452.           if ( opt->parseall && (opt->parsejava & HTSPARSE_NO_AGGRESSIVE) == 0 
  1453.             && (ptr>0) && (!in_media) /* && (!inscript_in_comments)*/ ) {   // option parsing "brut"
  1454.             //int incomment_justquit=0;
  1455.             if (!is_realspace(*adr)) {
  1456.               int noparse=0;
  1457.  
  1458.               // Gestion des /* */
  1459. #if 0
  1460.               if (inscript) {
  1461.                 if (parseall_incomment) {
  1462.                   if ((*adr=='/') && (*(adr-1)=='*'))
  1463.                     parseall_incomment=0;
  1464.                   incomment_justquit=1;       // ne pas noter dernier caractΦre
  1465.                 } else {
  1466.                   if ((*adr=='/') && (*(adr+1)=='*'))
  1467.                     parseall_incomment=1;
  1468.                 }
  1469.               } else
  1470.                 parseall_incomment=0;
  1471. #endif
  1472.               /* ensure automate state  0 (not in comments, quotes..) */
  1473.               if (inscript && ( 
  1474.                 inscript_state_pos != INSCRIPT_INQUOTE && inscript_state_pos != INSCRIPT_INQUOTE2
  1475.                 ) ) {
  1476.                   noparse=1;
  1477.                 }
  1478.  
  1479.                 /* vérifier que l'on n'est pas dans un <!-- --> pur */
  1480.                 if ( (!intag) && (incomment) && (!inscript))
  1481.                   noparse=1;        /* commentaire */
  1482.  
  1483.                 // recherche d'URLs
  1484.                 if (!noparse) {
  1485.                   //if ((!parseall_incomment) && (!noparse)) {
  1486.                   if (!p) {                   // non dΘja trouvΘ
  1487.                     if (adr != r->adr) {     // >1 caractΦre
  1488.                       // scanner les chaines
  1489.                       if ((*adr == '\"') || (*adr=='\'')) {         // "xx.gif" 'xx.gif'
  1490.                         if (strchr("=(,",parseall_lastc)) {    // exemple: a="img.gif.. (handles comments)
  1491.                           char *a=adr;
  1492.                           char stop=*adr;  // " ou '
  1493.                           int count=0;
  1494.  
  1495.                           // sauter caractΦres
  1496.                           a++;
  1497.                           // copier
  1498.                           while((*a) && (*a!='\'') && (*a!='\"') && (count<HTS_URLMAXSIZE)) { count++; a++; }
  1499.  
  1500.                           // ok chaine terminΘe par " ou '
  1501.                           if ((*a == stop) && (count<HTS_URLMAXSIZE) && (count>0)) {
  1502.                             char c;
  1503.                             char* aend;
  1504.                             //
  1505.                             aend=a;     // sauver dΘbut
  1506.                             a++;
  1507.                             while(is_taborspace(*a)) a++;
  1508.                             c=*a;
  1509.                             if (strchr("),;>/+\r\n",c)) {     // exemple: ..img.gif";
  1510.                               // le / est pour funct("img.gif" /* URL */);
  1511.                               char BIGSTK tempo[HTS_URLMAXSIZE*2];
  1512.                               char type[256];
  1513.                               int url_ok=0;      // url valide?
  1514.                               tempo[0]='\0'; type[0]='\0';
  1515.                               //
  1516.                               strncatbuff(tempo,adr+1,count);
  1517.                               //
  1518.                               if ((!strchr(tempo,' ')) || inscript) {   // espace dedans: mΘfiance! (sauf dans code javascript)
  1519.                                 int invalid_url=0;
  1520.  
  1521.                                 // escape                              
  1522.                                 unescape_amp(tempo);
  1523.  
  1524.                                 // Couper au # ou ? Θventuel
  1525.                                 {
  1526.                                   char* a=strchr(tempo,'#');
  1527.                                   if (a)
  1528.                                     *a='\0';
  1529.                                   a=strchr(tempo,'?');
  1530.                                   if (a)
  1531.                                     *a='\0';
  1532.                                 }
  1533.  
  1534.                                 // vΘrifier qu'il n'y a pas de caractΦres spΘciaux
  1535.                                 if (!strnotempty(tempo))
  1536.                                   invalid_url=1;
  1537.                                 else if (strchr(tempo,'*')
  1538.                                   || strchr(tempo,'<')
  1539.                                   || strchr(tempo,'>')
  1540.                                   || strchr(tempo,',')    /* list of files ? */
  1541.                                   || strchr(tempo,'\"')    /* potential parsing bug */
  1542.                                   || strchr(tempo,'\'')    /* potential parsing bug */
  1543.                                   )
  1544.                                   invalid_url=1;
  1545.                                 else if (tempo[0] == '.' && isalnum(tempo[1]))   // ".gif"
  1546.                                   invalid_url=1;
  1547.  
  1548.                                 /* non invalide? */
  1549.                                 if (!invalid_url) {
  1550.                                   // Un plus α la fin? Alors ne pas prendre sauf si extension ("/toto.html#"+tag)
  1551.                                   if (c!='+') {    // PAS de plus α la fin
  1552. #if 0
  1553.                                     char* a;
  1554. #endif
  1555.                                     // "Comparisons of scheme names MUST be case-insensitive" (RFC2616)                                  
  1556.                                     if (
  1557.                                       (strfield(tempo,"http:")) 
  1558.                                       || (strfield(tempo,"ftp:"))
  1559. #if HTS_USEOPENSSL
  1560.                                       || (
  1561.                                       SSL_is_available &&
  1562.                                       (strfield(tempo,"https:"))
  1563.                                       )
  1564. #endif
  1565. #if HTS_USEMMS
  1566.                                                                             || strfield(tempo,"mms:")
  1567. #endif
  1568.                                       )  // ok pas de problΦme
  1569.                                       url_ok=1;
  1570.                                     else if (tempo[strlen(tempo)-1]=='/') {        // un slash: ok..
  1571.                                       if (inscript)   // sinon si pas javascript, mΘfiance (rΘpertoire style base?)
  1572.                                         url_ok=1;
  1573.                                     } 
  1574. #if 0
  1575.                                     else if ((a=strchr(tempo,'/'))) {        // un slash: ok..
  1576.                                       if (inscript) {    // sinon si pas javascript, mΘfiance (style "text/css")
  1577.                                         if (strchr(a+1,'/'))     // un seul / : abandon (STYLE type='text/css')
  1578.                                           if (!strchr(tempo,' '))  // avoid spaces (too dangerous for comments)
  1579.                                             url_ok=1;
  1580.                                       }
  1581.                                     }
  1582. #endif
  1583.                                   }
  1584.                                   // Prendre si extension reconnue
  1585.                                   if (!url_ok) {
  1586.                                     get_httptype(opt,type,tempo,0);
  1587.                                     if (strnotempty(type))     // type reconnu!
  1588.                                       url_ok=1;
  1589.                                     else if (is_dyntype(get_ext(OPT_GET_BUFF(opt),tempo)))  // reconnu php,cgi,asp..
  1590.                                       url_ok=1;
  1591.                                     // MAIS pas les foobar@aol.com !!
  1592.                                     if (strchr(tempo,'@'))
  1593.                                       url_ok=0;
  1594.                                   }
  1595.                                   //
  1596.                                   // Ok, cela pourrait Ωtre une URL
  1597.                                   if (url_ok) {
  1598.  
  1599.                                     // Check if not fodbidden tag (id,name..)
  1600.                                     if (intag_start_valid) {
  1601.                                       if (intag_start)
  1602.                                         if (intag_startattr)
  1603.                                           if (intag)
  1604.                                             if (!inscript)
  1605.                                               if (!incomment) {
  1606.                                                 int i=0,nop=0;
  1607.                                                 while( (nop==0) && (strnotempty(hts_nodetect[i])) ) {
  1608.                                                   nop=rech_tageq(intag_startattr,hts_nodetect[i]);
  1609.                                                   i++;
  1610.                                                 }
  1611.                                                 // Forbidden tag
  1612.                                                 if (nop) {
  1613.                                                   url_ok=0;
  1614.                                                   if ((opt->debug>1) && (opt->log!=NULL)) {
  1615.                                                     HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"dirty parsing: bad tag avoided: %s"LF,hts_nodetect[i-1]); test_flush;
  1616.                                                   }
  1617.                                                 }
  1618.                                               }
  1619.                                     }
  1620.  
  1621.  
  1622.                                     // Accepter URL, on la traitera comme une URL normale!!
  1623.                                     if (url_ok) {
  1624.                                       valid_p = 1;
  1625.                                       p = 0;
  1626.                                     }
  1627.  
  1628.                                   }
  1629.                                 }
  1630.                               }
  1631.                             }
  1632.                           }
  1633.                         }
  1634.                       }
  1635.                     }
  1636.                   }  // p == 0               
  1637.  
  1638.                 } // not in comment
  1639.  
  1640.                 // plus dans un commentaire
  1641.                 if ( inscript_state_pos == INSCRIPT_START 
  1642.                   && inscript_state_pos_prev == INSCRIPT_START) {
  1643.                     parseall_lastc=*adr;             // caractΦre avant le prochain
  1644.                   }
  1645.  
  1646.  
  1647.             }  // if realspace
  1648.           }  // if parseall
  1649.  
  1650.  
  1651.           // ------------------------------------------------------------
  1652.           // p!=0 : on a repéré un éventuel lien
  1653.           // ------------------------------------------------------------
  1654.           //
  1655.           if ((p>0) || (valid_p)) {    // on a repΘrΘ un lien
  1656.             //int lien_valide=0;
  1657.             char* eadr=NULL;          /* fin de l'URL */
  1658.             char* quote_adr=NULL;     /* adresse du ? dans l'adresse */
  1659.             int ok=1;
  1660.             char quote='\0';
  1661.             int quoteinscript=0;
  1662.             int  noquote=0;
  1663.                         char *tag_attr_start = adr;
  1664.  
  1665.             // si nofollow ou un stop a été déclenché, réécrire tous les liens en externe
  1666.             if ((nofollow) || (opt->state.stop))
  1667.               p_nocatch=1;
  1668.  
  1669.             // écrire codebase avant, flusher avant code
  1670.             if ((p_type==-1) || (p_type==-2)) {
  1671.               if ((opt->getmode & 1) && (ptr>0)) {
  1672.                 HT_ADD_ADR;    // refresh
  1673.               }
  1674.               lastsaved=adr;    // dernier Θcrit+1
  1675.             }
  1676.  
  1677.             // sauter espaces
  1678.             // adr+=p;
  1679.             INCREMENT_CURRENT_ADR(p);
  1680.             while( ( is_space(*adr) || (
  1681.               inscriptgen 
  1682.               && adr[0] == '\\' 
  1683.               && is_space(adr[1])
  1684.               )
  1685.               )
  1686.               && quote == '\0'
  1687.               ) {
  1688.                 if (!quote)
  1689.                   if ((*adr=='\"') || (*adr=='\'')) {
  1690.                     quote=*adr;                     // on doit attendre cela α la fin
  1691.                     if (inscriptgen && *(adr - 1) == '\\') {
  1692.                       quoteinscript=1;  /* will wait for \" */
  1693.                     }
  1694.                   }
  1695.                   // puis quitter
  1696.                   // adr++;    // sauter les espaces, "" et cie
  1697.                   INCREMENT_CURRENT_ADR(1);
  1698.               }
  1699.  
  1700.               /* Stop at \n (LF) if primary links or link lists */
  1701.               if (ptr == 0 || (in_media && strcmp(in_media,"LNK")==0))
  1702.                 quote='\n';
  1703.               /* s'arrΩter que ce soit un ' ou un " : pour document.write('<img src="foo'+a); par exemple! */
  1704.               else if (inscript && ! unquoted_script)
  1705.                 noquote=1;
  1706.  
  1707.               // sauter Θventuel \" ou \' javascript
  1708.               if (inscript) {    // on est dans un obj.write("..
  1709.                 if (*adr=='\\') {
  1710.                   if ((*(adr+1)=='\'') || (*(adr+1)=='"')) {  // \" ou \'
  1711.                     // adr+=2;    // sauter
  1712.                     INCREMENT_CURRENT_ADR(2);
  1713.                   }
  1714.                 }
  1715.               }
  1716.  
  1717.               // sauter content="1;URL=http://..
  1718.               if (p_searchMETAURL) {
  1719.                 int l=0;
  1720.                 while(
  1721.                   (adr + l + 4 < r->adr + r->size)
  1722.                   && (!strfield(adr+l,"URL=")) 
  1723.                   && (l<128) ) l++;
  1724.                 if (!strfield(adr+l,"URL="))
  1725.                   ok=-1;
  1726.                 else
  1727.                   adr+=(l+4);
  1728.               }
  1729.  
  1730.               /* éviter les javascript:document.location=.. : les parser, plutôt */
  1731.               if (ok!=-1) {
  1732.                 if (strfield(adr,"javascript:") 
  1733.                   && ! inscript       /* we don't want to parse 'javascript:' inside document.write inside scripts */
  1734.                   ) {
  1735.                     ok=-1;
  1736.                     /*
  1737.                     On est désormais dans du code javascript
  1738.                     */
  1739.                     inscript_name="";
  1740.                     inscript_tag=inscript=1;
  1741.                     inscript_state_pos=INSCRIPT_START;
  1742.                     inscript_tag_lastc=quote;     /* α attendre α la fin */
  1743.                     if (opt->parsedebug) { HT_ADD("<@@ inscript @@>"); }
  1744.                   }
  1745.               }
  1746.  
  1747.               if (p_type==1) {
  1748.                 if (*adr=='#') {
  1749.                   adr++;           // sauter # pour usemap etc
  1750.                 }
  1751.               }
  1752.               eadr=adr;
  1753.  
  1754.               // ne pas flusher aprΦs code si on doit Θcrire le codebase avant!
  1755.               if ((p_type!=-1) && (p_type!=2) && (p_type!=-2)) {
  1756.                 if ((opt->getmode & 1) && (ptr>0)) {
  1757.                   HT_ADD_ADR;    // refresh
  1758.                 }
  1759.                 lastsaved=adr;    // dernier Θcrit+1
  1760.                 // après on écrira soit les données initiales,
  1761.                 // soit une URL/lien modifié!
  1762.               } else if (p_type==-1) p_flush=adr;    // flusher jusqu'α adr ensuite
  1763.  
  1764.               if (ok!=-1) {    // continuer
  1765.                 // dΘcouper le lien
  1766.                 do {
  1767.                   if ((* (unsigned char*) eadr)<32) {   // caractΦre de contr⌠le (ou \0)
  1768.                     if (!is_space(*eadr))
  1769.                       ok=0; 
  1770.                   }
  1771.                   if ( ( ((int) (eadr - adr)) ) > HTS_URLMAXSIZE)  // ** trop long, >HTS_URLMAXSIZE caractΦres (on prΘvoit HTS_URLMAXSIZE autres pour path)
  1772.                     ok=-1;    // ne pas traiter ce lien
  1773.  
  1774.                   if (ok > 0) {
  1775.                     //if (*eadr!=' ') {  
  1776.                     if (is_space(*eadr)) {   // guillemets,CR, etc
  1777.                       if ( 
  1778.                         ( *eadr == quote && ( !quoteinscript || *(eadr -1) == '\\') )  // end quote
  1779.                         || ( noquote && (*eadr == '\"' || *eadr == '\'') )       // end at any quote
  1780.                         || (!noquote && quote == '\0' && is_realspace(*eadr) )   // unquoted href
  1781.                         )     // si pas d'attente de quote spΘciale ou si quote atteinte
  1782.                         ok=0; 
  1783.                     } else if (ending_p && (*eadr==ending_p))
  1784.                       ok=0;
  1785.                     else {
  1786.                       switch(*eadr) {
  1787.                     case '>': 
  1788.                       if (!quote) {
  1789.                         if (!inscript && !in_media) {
  1790.                           intag=0;    // PLUS dans un tag!
  1791.                           intag_start_valid=0;
  1792.                                                     intag_name = NULL;
  1793.                         }
  1794.                         ok=0;
  1795.                       }
  1796.                       break;
  1797.                       /*case '<':*/ 
  1798.                     case '#': 
  1799.                       if (*(eadr-1) != '&')       // (
  1800.                         ok=0; 
  1801.                       break;
  1802.                       // case '?': non!
  1803.                     case '\\': if (inscript) ok=0; break;     // \" ou \' point d'arrΩt
  1804.                     case '?': quote_adr=adr; break;           // noter position query
  1805.                       }
  1806.                     }
  1807.                     //}
  1808.                   } 
  1809.                   eadr++;
  1810.                 } while(ok==1);
  1811.  
  1812.                 // Empty link detected
  1813.                 if ( (((int) (eadr - adr))) <= 1) {       // link empty
  1814.                   ok=-1;        // No
  1815.                   if (*adr != '#') {        // Not empty+unique #
  1816.                     if ( (((int) (eadr - adr)) == 1)) {       // 1=link empty with delim (end_adr-start_adr)
  1817.                       if (quote) {
  1818.                         if ((opt->getmode & 1) && (ptr>0)) { 
  1819.                           HT_ADD("#");        // We add this for a <href="">
  1820.                         }
  1821.                       }
  1822.                     }
  1823.                   }
  1824.                 }
  1825.  
  1826.                 // This is a dirty and horrible hack to avoid parsing an Adobe GoLive bogus tag
  1827.                 if (strfield(adr, "(Empty Reference!)")) {
  1828.                   ok=-1;        // No
  1829.                 }
  1830.  
  1831.               }
  1832.  
  1833.               if (ok==0) {    // tester un lien
  1834.                 char BIGSTK lien[HTS_URLMAXSIZE*2];
  1835.                 int meme_adresse=0;      // 0 par dΘfaut pour primary
  1836.                 //char *copie_de_adr=adr;
  1837.                 //char* p;
  1838.  
  1839.                 // construire lien (dΘcoupage)
  1840.                 if ( (((int) (eadr -  adr))-1) < HTS_URLMAXSIZE  ) {    // pas trop long?
  1841.                   strncpy(lien,adr,((int) (eadr - adr))-1);
  1842.                   *(lien+  (((int) (eadr -  adr)))-1  )='\0';
  1843.                   //printf("link: %s\n",lien);          
  1844.                   // supprimer les espaces
  1845.                   while((lien[strlen(lien)-1]==' ') && (strnotempty(lien))) lien[strlen(lien)-1]='\0';
  1846.  
  1847.  
  1848.                 } else
  1849.                   lien[0]='\0';    // erreur
  1850.  
  1851.  
  1852.                 // ------------------------------------------------------
  1853.                 // Lien repéré et extrait
  1854.                 if (strnotempty(lien)>0) {           // construction du lien
  1855.                   char BIGSTK adr[HTS_URLMAXSIZE*2],fil[HTS_URLMAXSIZE*2];          // ATTENTION adr cache le "vrai" adr
  1856.                   int forbidden_url=-1;              // lien non interdit (mais non autorisΘ..)
  1857.                   int just_test_it=0;                // mode de test des liens
  1858.                   int set_prio_to=0;                 // pour capture de page isolΘe
  1859.                   int import_done=0;                 // lien importΘ (ne pas scanner ensuite *α priori*)
  1860.                   //
  1861.                   adr[0]='\0'; fil[0]='\0';
  1862.                   //
  1863.                   // 0: autorisΘ
  1864.                   // 1: interdit (patcher tout de mΩme adresse)
  1865.  
  1866.                   if ((opt->debug>1) && (opt->log!=NULL)) {
  1867.                     HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"link detected in html (tag): %s"LF,lien); test_flush;
  1868.                   }
  1869.  
  1870.                   // external check
  1871.                   if (!RUN_CALLBACK1(opt, linkdetected, lien) || !RUN_CALLBACK2(opt, linkdetected2, lien, intag_start)) {
  1872.                     error=1;    // erreur
  1873.                     if (opt->log) {
  1874.                       HTS_LOG(opt,LOG_ERROR); fprintf(opt->log,"Link %s refused by external wrapper"LF,lien);
  1875.                       test_flush;
  1876.                     }
  1877.                   }
  1878.  
  1879. #if HTS_STRIP_DOUBLE_SLASH
  1880.                   // supprimer les // en / (sauf pour http://)
  1881.                   if (opt->urlhack) {
  1882.                     char *a,*p,*q;
  1883.                     int done=0;
  1884.                     a=strchr(lien,':');    // http://
  1885.                     if (a) {
  1886.                       a++;
  1887.                       while(*a=='/') a++;    // position aprΦs http://
  1888.                     } else {
  1889.                       a=lien;                // dΘbut
  1890.                       while(*a=='/') a++;    // position aprΦs http://
  1891.                     }
  1892.                     q=strchr(a,'?');     // ne pas traiter aprΦs '?'
  1893.                     if (!q)
  1894.                       q=a+strlen(a)-1;
  1895.                     while(( p=strstr(a,"//")) && (!done) ) {    // remplacer // par /
  1896.                       if ((int) p>(int) q) {   // aprΦs le ? (toto.cgi?param=1//2.3)
  1897.                         done=1;    // stopper
  1898.                       } else {
  1899.                         char BIGSTK tempo[HTS_URLMAXSIZE*2];
  1900.                         tempo[0]='\0';
  1901.                         strncatbuff(tempo,a,(int) p - (int) a);
  1902.                         strcatbuff (tempo,p+1);
  1903.                         strcpybuff(a,tempo);    // recopier
  1904.                       }
  1905.                     }
  1906.                   }
  1907. #endif
  1908.  
  1909.                   // purger espaces de début et fin, CR,LF résiduels
  1910.                   // (IMG SRC="foo.<\n><\t>gif<\t>")
  1911.                   {
  1912.                     char* a = lien;
  1913.                     size_t llen;
  1914.  
  1915.                     // strip ending spaces
  1916.                     llen = ( *a != '\0' ) ? strlen(a) : 0;
  1917.                     while(llen > 0 && is_realspace(lien[llen - 1]) ) {
  1918.                       a[--llen]='\0';
  1919.                     } 
  1920.                     //  skip leading ones
  1921.                     while(is_realspace(*a)) a++;
  1922.                     // strip cr, lf, tab inside URL
  1923.                     llen = 0;
  1924.                     while(*a) {
  1925.                       if (*a != '\n' && *a != '\r' && *a != '\t') {
  1926.                         lien[llen++] = *a;
  1927.                       }
  1928.                       a++;
  1929.                     }
  1930.                     lien[llen] = '\0';
  1931.                   }
  1932.  
  1933.                   // commas are forbidden
  1934.                   if (archivetag_p) {
  1935.                     if (strchr(lien, ',')) {
  1936.                       error=1;    // erreur
  1937.                       if ((opt->debug>1) && (opt->log!=NULL)) {
  1938.                         HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"link rejected (multiple-archive) %s"LF,lien); test_flush;
  1939.                       }
  1940.                     }
  1941.                   }               
  1942.  
  1943.                   /* Unescape/escape %20 and other   */
  1944.                   {
  1945.                     char BIGSTK query[HTS_URLMAXSIZE*2];
  1946.                     char* a=strchr(lien,'?');
  1947.                     if (a) {
  1948.                       strcpybuff(query,a);
  1949.                       *a='\0';
  1950.                     } else
  1951.                       query[0]='\0';
  1952.                     // conversion &amp; -> & et autres joyeusetés
  1953.                     unescape_amp(lien);
  1954.                     unescape_amp(query);
  1955.                     // décoder l'inutile (%2E par exemple) et coder espaces
  1956.                     // XXXXXXXXXXXXXXXXX strcpybuff(lien,unescape_http(lien));
  1957.                     //strcpybuff(lien,unescape_http_unharm(lien, (no_esc_utf)?0:1));
  1958.                                         /* Never unescape high-chars (we don't know the encoding!!) */
  1959.                     strcpybuff(lien,unescape_http_unharm(catbuff,lien, 1));   /* note: '%' is still escaped */
  1960.                     escape_remove_control(lien);
  1961.                     escape_spc_url(lien);
  1962.                     strcatbuff(lien,query);     /* restore */
  1963.                   }
  1964.  
  1965.                   // convertir les éventuels \ en des / pour éviter des problèmes de reconnaissance!
  1966.                   {
  1967.                     char* a;
  1968.                     for(a = jump_identification(lien) ; *a != '\0' && *a != '?' ; a++) {
  1969.                       if (*a == '\\') {
  1970.                         *a = '/';
  1971.                       }
  1972.                     }
  1973.                   }
  1974.  
  1975.                   // supprimer le(s) ./
  1976.                   while ((lien[0]=='.') && (lien[1]=='/')) {
  1977.                     char BIGSTK tempo[HTS_URLMAXSIZE*2];
  1978.                     strcpybuff(tempo,lien+2);
  1979.                     strcpybuff(lien,tempo);
  1980.                   }
  1981.                   if (strnotempty(lien)==0)  // sauf si plus de nom de fichier
  1982.                     strcpybuff(lien,"./");
  1983.  
  1984.                   // vérifie les /~machin -> /~machin/
  1985.                   // supposition dangereuse?
  1986.                   // OUI!!
  1987. #if HTS_TILDE_SLASH
  1988.                   if (lien[strlen(lien)-1]!='/') {
  1989.                     char *a=lien+strlen(lien)-1;
  1990.                     // Θviter aussi index~1.html
  1991.                     while (((int) a>(int) lien) && (*a!='~') && (*a!='/') && (*a!='.')) a--;
  1992.                     if (*a=='~') {
  1993.                       strcatbuff(lien,"/");    // ajouter slash
  1994.                     }
  1995.                   }
  1996. #endif
  1997.  
  1998.                   // APPLET CODE="mixer.MixerApplet.class" --> APPLET CODE="mixer/MixerApplet.class"
  1999.                   // yes, this is dirty
  2000.                   // but I'm so lazzy..
  2001.                   // and besides the java "code" convention is really a pain in html code
  2002.                   if (p_type==-1) {
  2003.                     char* a=strrchr(lien,'.');
  2004.                     add_class_dots_to_patch=0;
  2005.                     if (a) {
  2006.                       char* b;
  2007.                       do {
  2008.                         b=strchr(lien,'.');
  2009.                         if ((b != a) && (b)) {
  2010.                           add_class_dots_to_patch++;
  2011.                           *b='/';
  2012.                         }
  2013.                       } while((b != a) && (b));
  2014.                     }
  2015.                   }
  2016.  
  2017.                   // éliminer les éventuels :80 (port par défaut!)
  2018.                   if (link_has_authority(lien)) {
  2019.                     char * a;
  2020.                     a=strstr(lien,"//");    // "//" authority
  2021.                     if (a)
  2022.                       a+=2;
  2023.                     else
  2024.                       a=lien;
  2025.                     // while((*a) && (*a!='/') && (*a!=':')) a++;
  2026.                     a=jump_toport(a);
  2027.                     if (a) {  // port
  2028.                       int port=0;
  2029.                       int defport=80;
  2030.                       char* b=a+1;
  2031. #if HTS_USEOPENSSL
  2032.                       // FIXME
  2033.                       //if (strfield(adr, "https:")) {
  2034.                       //}
  2035. #endif
  2036.                       while(isdigit((unsigned char)*b)) { port*=10; port+=(int) (*b-'0'); b++; }
  2037.                       if (port==defport) {  // port 80, default - c'est dΘbile
  2038.                         char BIGSTK tempo[HTS_URLMAXSIZE*2];
  2039.                         tempo[0]='\0';
  2040.                         strncatbuff(tempo,lien,(int) (a - lien));
  2041.                         strcatbuff(tempo,a+3);  // sauter :80
  2042.                         strcpybuff(lien,tempo);
  2043.                       }
  2044.                     }
  2045.                   }
  2046.  
  2047.                   // filtrer les parazites (mailto & cie)
  2048.                   /*
  2049.                   if (strfield(lien,"mailto:")) {  // ne pas traiter
  2050.                   error=1;
  2051.                   } else if (strfield(lien,"news:")) {  // ne pas traiter
  2052.                   error=1;
  2053.                   }
  2054.                   */
  2055.  
  2056.                   // vérifier que l'on ne doit pas ajouter de .class
  2057.                   if (!error) {
  2058.                     if (add_class) {
  2059.                       char *a = lien+strlen(lien)-1;
  2060.                       while(( a > lien) && (*a!='/') && (*a!='.')) a--;
  2061.                       if (*a != '.')
  2062.                         strcatbuff(lien,".class");    // ajouter .class
  2063.                       else if (!strfield2(a,".class"))
  2064.                         strcatbuff(lien,".class");    // idem
  2065.                     }
  2066.                   }
  2067.  
  2068.                   // si c'est un chemin, alors vérifier (toto/toto.html -> http://www/toto/)
  2069.                   if (!error) {
  2070.                     if ((opt->debug>1) && (opt->log!=NULL)) {
  2071.                       HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"position link check %s"LF,lien); test_flush;
  2072.                     }
  2073.  
  2074.                     if ((p_type==2) || (p_type==-2)) {   // code ou codebase                        
  2075.                       // Vérifier les codebase=applet (au lieu de applet/)
  2076.                       if (p_type==-2) {    // codebase
  2077.                         if (strnotempty(lien)) {
  2078.                           if (fil[strlen(lien)-1]!='/') {  // pas rΘpertoire
  2079.                             strcatbuff(lien,"/");
  2080.                           }
  2081.                         }
  2082.                       }
  2083.  
  2084.                       /* base has always authority */
  2085.                       if (p_type==2 && !link_has_authority(lien)) {
  2086.                         char BIGSTK tmp[HTS_URLMAXSIZE*2];
  2087.                         strcpybuff(tmp, "http://");
  2088.                         strcatbuff(tmp, lien);
  2089.                         strcpybuff(lien, tmp);
  2090.                       }
  2091.  
  2092.                       /* only one ending / (bug on some pages) */
  2093.                       if ((int)strlen(lien)>2) {
  2094.                         int len = (int) strlen(lien);
  2095.                         while(len > 1 && lien[len-1] == '/' && lien[len-2] == '/' )    /* double // (bug) */
  2096.                           lien[--len]='\0';
  2097.                       }
  2098.                       // copier nom host si besoin est
  2099.                       if (!link_has_authority(lien)) {  // pas de http://
  2100.                         char BIGSTK adr2[HTS_URLMAXSIZE*2],fil2[HTS_URLMAXSIZE*2];  // ** euh ident_url_relatif??
  2101.                         if (ident_url_relatif(lien,urladr,urlfil,adr2,fil2)<0) {                        
  2102.                           error=1;
  2103.                         } else {
  2104.                           strcpybuff(lien,"http://");
  2105.                           strcatbuff(lien,adr2);
  2106.                           if (*fil2!='/')
  2107.                             strcatbuff(lien,"/");
  2108.                           strcatbuff(lien,fil2);
  2109.                           {
  2110.                             char* a;
  2111.                             a=lien+strlen(lien)-1;
  2112.                             while((*a) && (*a!='/') && ( a> lien)) a--;
  2113.                             if (*a=='/') {
  2114.                               *(a+1)='\0';
  2115.                             }
  2116.                           }
  2117.                           //char BIGSTK tempo[HTS_URLMAXSIZE*2];
  2118.                           //strcpybuff(tempo,"http://");
  2119.                           //strcatbuff(tempo,urladr);    // host
  2120.                           //if (*lien!='/')
  2121.                           //  strcatbuff(tempo,"/");
  2122.                           //strcatbuff(tempo,lien);
  2123.                           //strcpybuff(lien,tempo);
  2124.                         }
  2125.                       }
  2126.  
  2127.                       if (!error) {  // pas d'erreur?
  2128.                         if (p_type==2) {   // code ET PAS codebase      
  2129.                           char* a=lien+strlen(lien)-1;
  2130.                           char* start_of_filename = jump_identification(lien);
  2131.                           if (start_of_filename != NULL 
  2132.                             && (start_of_filename = strchr(start_of_filename, '/')) != NULL)
  2133.                             start_of_filename++;
  2134.                           if (start_of_filename == NULL)
  2135.                             strcatbuff(lien, "/");
  2136.                           while( (a > lien) && (*a) && (*a!='/')) a--;
  2137.                           if (*a=='/') {     // ok on a repΘrΘ le dernier /
  2138.                             if (start_of_filename != NULL && a >= start_of_filename) {
  2139.                               *(a+1)='\0';   // couper
  2140.                             }
  2141.                           } else {
  2142.                             *lien='\0';    // Θliminer
  2143.                             error=1;   // erreur, ne pas poursuivre
  2144.                           }      
  2145.                         }
  2146.  
  2147.                         // stocker base ou codebase?
  2148.                         switch(p_type) {
  2149.                       case 2: { 
  2150.                         //if (*lien!='/') strcatbuff(base,"/");
  2151.                         strcpybuff(base,lien);
  2152.                               }
  2153.                               break;      // base
  2154.                       case -2: {
  2155.                         //if (*lien!='/') strcatbuff(codebase,"/");
  2156.                         strcpybuff(codebase,lien); 
  2157.                                }
  2158.                                break;  // base
  2159.                         }
  2160.  
  2161.                         if ((opt->debug>1) && (opt->log!=NULL)) {
  2162.                           HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"code/codebase link %s base %s"LF,lien,base); test_flush;
  2163.                         }
  2164.                         //printf("base code: %s - %s\n",lien,base);
  2165.                       }
  2166.  
  2167.                     } else {
  2168.                       char* _base;
  2169.                       if (p_type==-1)   // code (applet)
  2170.                         _base=codebase;
  2171.                       else
  2172.                         _base=base;
  2173.  
  2174.  
  2175.                       // ajouter chemin de base href..
  2176.                       if (strnotempty(_base)) {       // considΘrer base
  2177.                         if (!link_has_authority(lien)) {    // non absolue
  2178.                           if (*lien!='/') {           // non absolu sur le site (/)
  2179.                             if ( ((int) strlen(_base)+(int) strlen(lien))<HTS_URLMAXSIZE) {
  2180.                               // mailto: and co: do NOT add base
  2181.                               if (ident_url_relatif(lien,urladr,urlfil,adr,fil)>=0) {
  2182.                                 char BIGSTK tempo[HTS_URLMAXSIZE*2];
  2183.                                 // base est absolue
  2184.                                 strcpybuff(tempo,_base);
  2185.                                 strcatbuff(tempo,lien + ((*lien=='/')?1:0) );
  2186.                                 strcpybuff(lien,tempo);        // patcher en considΘrant base
  2187.                                 // ** vérifier que ../ fonctionne (ne doit pas arriver mais bon..)
  2188.  
  2189.                                 if ((opt->debug>1) && (opt->log!=NULL)) {
  2190.                                   HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"link modified with code/codebase %s"LF,lien); test_flush;
  2191.                                 }
  2192.                               }
  2193.                             } else {
  2194.                               error=1;    // erreur
  2195.                               if (opt->log) {
  2196.                                 HTS_LOG(opt,LOG_ERROR); fprintf(opt->log,"Link %s too long with base href"LF,lien);
  2197.                                 test_flush;
  2198.                               }
  2199.                             }
  2200.                           } else {
  2201.                             char BIGSTK badr[HTS_URLMAXSIZE*2], bfil[HTS_URLMAXSIZE*2];
  2202.                             if (ident_url_absolute(_base, badr, bfil) >=0 ) {
  2203.                               if ( ((int) strlen(badr)+(int) strlen(lien)) < HTS_URLMAXSIZE) {
  2204.                                 char BIGSTK tempo[HTS_URLMAXSIZE*2];
  2205.                                 // base est absolue
  2206.                                 tempo[0] = '\0';
  2207.                                 if (!link_has_authority(badr)) {
  2208.                                   strcatbuff(tempo, "http://");
  2209.                                 }
  2210.                                 strcatbuff(tempo,badr);
  2211.                                 strcatbuff(tempo,lien);
  2212.                                 strcpybuff(lien,tempo);        // patcher en considΘrant base
  2213.  
  2214.                                 if ((opt->debug>1) && (opt->log!=NULL)) {
  2215.                                   HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"link modified with code/codebase %s"LF,lien); test_flush;
  2216.                                 }
  2217.                               } else {
  2218.                                 error=1;    // erreur
  2219.                                 if (opt->log) {
  2220.                                   HTS_LOG(opt,LOG_ERROR); fprintf(opt->log,"Link %s too long with base href"LF,lien);
  2221.                                   test_flush;
  2222.                                 }
  2223.                               }
  2224.                             }
  2225.                           }
  2226.                         }
  2227.                       }
  2228.  
  2229.  
  2230.                     }
  2231.                   }
  2232.  
  2233.                   // transformer lien quelconque (http, relatif, etc) en une adresse
  2234.                   // et un chemin+fichier (adr,fil)
  2235.                   if (!error) {
  2236.                     int reponse;
  2237.                     if ((opt->debug>1) && (opt->log!=NULL)) {
  2238.                       HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"build relative link %s with %s%s"LF,lien,relativeurladr,relativeurlfil); test_flush;
  2239.                     }
  2240.                     if ((reponse=ident_url_relatif(lien,relativeurladr,relativeurlfil,adr,fil))<0) {                        
  2241.                       adr[0]='\0';    // erreur
  2242.                       if (reponse==-2) {
  2243.                         if (opt->log) {
  2244.                           HTS_LOG(opt,LOG_WARNING); fprintf(opt->log,"Link %s not caught (unknown protocol)"LF,lien);
  2245.                           test_flush;
  2246.                         }
  2247.                       } else {
  2248.                         if ((opt->debug>1) && (opt->log!=NULL)) {
  2249.                           HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"ident_url_relatif failed for %s with %s%s"LF,lien,relativeurladr,relativeurlfil); test_flush;
  2250.                         }
  2251.                       }
  2252.                     } else {
  2253.                       if ((opt->debug>1) && (opt->log!=NULL)) {
  2254.                         HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"built relative link %s with %s%s -> %s%s"LF,lien,relativeurladr,relativeurlfil,adr,fil); test_flush;
  2255.                       }
  2256.                     }
  2257.                   } else {
  2258.                     if ((opt->debug>1) && (opt->log!=NULL)) {
  2259.                       HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"link %s not build, error detected before"LF,lien); test_flush;
  2260.                     }
  2261.                     adr[0]='\0';
  2262.                   }
  2263.  
  2264. #if HTS_CHECK_STRANGEDIR
  2265.                   // !ATTENTION!
  2266.                   // Ici on teste les exotiques du genre www.truc.fr/machin (sans slash à la fin)
  2267.                   // je n'ai pas encore trouvé le moyen de faire la différence entre un répertoire
  2268.                   // et un fichier en http A PRIORI : je fais donc un test
  2269.                   // En cas de moved xxx, on recalcule adr et fil, tout simplement
  2270.                   // DEFAUT: test effectué plusieurs fois! à revoir!!!
  2271.                   if ((adr[0]!='\0') && (strcmp(adr,"file://") && (p_type!=2) && (p_type!=-2)) {
  2272.                     //## if ((adr[0]!='\0') && (adr[0]!=lOCAL_CHAR) && (p_type!=2) && (p_type!=-2)) {
  2273.                     if (fil[strlen(fil)-1]!='/') {  // pas rΘpertoire
  2274.                       if (ishtml(opt,fil)==-2) {    // pas d'extension
  2275.                         char BIGSTK loc[HTS_URLMAXSIZE*2];  // Θventuelle nouvelle position
  2276.                         loc[0]='\0';
  2277.                         if ((opt->debug>1) && (opt->log!=NULL)) {
  2278.                           HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"link-check-directory: %s%s"LF,adr,fil);
  2279.                           test_flush;
  2280.                         }
  2281.  
  2282.                         // tester éventuelle nouvelle position
  2283.                         switch (http_location(adr,fil,loc).statuscode) {
  2284.                       case 200: // ok au final
  2285.                         if (strnotempty(loc)) {  // a changΘ d'adresse
  2286.                           if (opt->log) {
  2287.                             HTS_LOG(opt,LOG_WARNING); fprintf(opt->log,"Link %s%s has moved to %s for %s%s"LF,adr,fil,loc,urladr,urlfil);
  2288.                             test_flush;
  2289.                           }
  2290.  
  2291.                           // recalculer adr et fil!
  2292.                           if (ident_url_absolute(loc,adr,fil)==-1) {
  2293.                             adr[0]='\0';  // cancel
  2294.                             if ((opt->debug>1) && (opt->log!=NULL)) {
  2295.                               HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"link-check-dir: %s%s"LF,adr,fil);
  2296.                               test_flush;
  2297.                             }
  2298.                           }
  2299.  
  2300.                         }
  2301.                         break;
  2302.                       case -2: case -3:  // timeout ou erreur grave
  2303.                         if (opt->log) {
  2304.                           HTS_LOG(opt,LOG_WARNING); fprintf(opt->log,"Connection too slow for testing link %s%s (from %s%s)"LF,adr,fil,urladr,urlfil);
  2305.                           test_flush;
  2306.                         }
  2307.  
  2308.                         break;
  2309.                         }
  2310.  
  2311.                       }
  2312.                     } 
  2313.                   }
  2314. #endif
  2315.  
  2316.                   // Le lien doit juste être réécrit, mais ne doit pas générer un lien
  2317.                   // exemple: <FORM ACTION="url_cgi">
  2318.                   if (p_nocatch) {
  2319.                     forbidden_url=1;    // interdire rΘcupΘration du lien
  2320.                     if ((opt->debug>1) && (opt->log!=NULL)) {
  2321.                       HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"link forced external at %s%s"LF,adr,fil);
  2322.                       test_flush;
  2323.                     }
  2324.                   }
  2325.  
  2326.                   // Tester si un lien doit être accepté ou refusé (wizard)
  2327.                   // forbidden_url=1 : lien refusé
  2328.                   // forbidden_url=0 : lien accepté
  2329.                   //if ((ptr>0) && (p_type!=2) && (p_type!=-2)) {    // tester autorisations?
  2330.                   if ((p_type!=2) && (p_type!=-2)) {    // tester autorisations?
  2331.                     if (!p_nocatch) {
  2332.                       if (adr[0]!='\0') {          
  2333.                         if ((opt->debug>1) && (opt->log!=NULL)) {
  2334.                           HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"wizard link test at %s%s.."LF,adr,fil);
  2335.                           test_flush;
  2336.                         }
  2337.                         forbidden_url=hts_acceptlink(opt,ptr,lien_tot,liens,
  2338.                           adr,fil,
  2339.                                                     intag_name ? intag_name : NULL, intag_name ? tag_attr_start : NULL,
  2340.                           &set_prio_to,
  2341.                           &just_test_it);
  2342.                         if ((opt->debug>1) && (opt->log!=NULL)) {
  2343.                           HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"result for wizard link test: %d"LF,forbidden_url);
  2344.                           test_flush;
  2345.                         }
  2346.                       }
  2347.                     }
  2348.                   }
  2349.  
  2350.                   // calculer meme_adresse
  2351.                   meme_adresse=strfield2(jump_identification(adr),jump_identification(urladr));
  2352.  
  2353.                   // Début partie sauvegarde
  2354.  
  2355.                   // ici on forme le nom du fichier à sauver, et on patche l'URL
  2356.                   if (adr[0]!='\0') {
  2357.                     // savename: simplifier les ../ et autres joyeusetés
  2358.                     char BIGSTK save[HTS_URLMAXSIZE*2];
  2359.                     int r_sv=0;
  2360.                     // En cas de moved, adresse première
  2361.                     char BIGSTK former_adr[HTS_URLMAXSIZE*2];
  2362.                     char BIGSTK former_fil[HTS_URLMAXSIZE*2];
  2363.                     //
  2364.                     save[0]='\0'; former_adr[0]='\0'; former_fil[0]='\0';
  2365.                     //
  2366.  
  2367.                     // nom du chemin à sauver si on doit le calculer
  2368.                     // note: url_savename peut décider de tester le lien si il le trouve
  2369.                     // suspect, et modifier alors adr et fil
  2370.                     // dans ce cas on aura une référence directe au lieu des traditionnels
  2371.                     // moved en cascade (impossible à reproduire à priori en local, lorsque des fichiers
  2372.                     // gif sont impliqués par exemple)
  2373.                     if ((p_type!=2) && (p_type!=-2)) {  // pas base href ou codebase
  2374.                       if (forbidden_url!=1) {
  2375.                         char BIGSTK last_adr[HTS_URLMAXSIZE*2];
  2376.  
  2377.                         /* Calc */
  2378.                         last_adr[0]='\0';
  2379.                         //char last_fil[HTS_URLMAXSIZE*2]="";
  2380.                         strcpybuff(last_adr,adr);    // ancienne adresse
  2381.                         //strcpybuff(last_fil,fil);    // ancien chemin
  2382.                         r_sv=url_savename(adr,fil,save,former_adr,former_fil,liens[ptr]->adr,liens[ptr]->fil,opt,liens,lien_tot,sback,cache,hash,ptr,numero_passe,NULL);
  2383.                         if (strcmp(jump_identification(last_adr),jump_identification(adr)) != 0) {  // a changΘ
  2384.  
  2385.                           // 2e test si moved
  2386.  
  2387.                           // Tester si un lien doit être accepté ou refusé (wizard)
  2388.                           // forbidden_url=1 : lien refusé
  2389.                           // forbidden_url=0 : lien accepté
  2390.                           if ((ptr>0) && (p_type!=2) && (p_type!=-2)) {    // tester autorisations?
  2391.                             if (!p_nocatch) {
  2392.                               if (adr[0]!='\0') {          
  2393.                                 if ((opt->debug>1) && (opt->log!=NULL)) {
  2394.                                   HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"wizard moved link retest at %s%s.."LF,adr,fil);
  2395.                                   test_flush;
  2396.                                 }
  2397.                                 forbidden_url=hts_acceptlink(opt,ptr,lien_tot,liens,
  2398.                                   adr,fil,
  2399.                                   intag_name ? intag_name : NULL, intag_name ? tag_attr_start : NULL,
  2400.                                   &set_prio_to,
  2401.                                   &just_test_it);
  2402.                                 if ((opt->debug>1) && (opt->log!=NULL)) {
  2403.                                   HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"result for wizard moved link retest: %d"LF,forbidden_url);
  2404.                                   test_flush;
  2405.                                 }
  2406.                               }
  2407.                             }
  2408.                           }
  2409.  
  2410.                           //import_done=1;    // c'est un import!
  2411.                           meme_adresse=0;   // on a changΘ
  2412.                         }
  2413.                       } else {
  2414.                         strcpybuff(save,"");  // dummy
  2415.                       }
  2416.                     }
  2417.  
  2418.                     // resolve unresolved type
  2419.                     if (r_sv!=-1
  2420.                       && p_type != 2 && p_type != -2
  2421.                       && forbidden_url == 0
  2422.                       && IS_DELAYED_EXT(save)
  2423.                       )
  2424.                     {
  2425.                       time_t t;
  2426.                       
  2427.                       // pas d'erreur, on continue
  2428.                       r_sv = hts_wait_delayed(str, adr, fil, save, parenturladr, parenturlfil, former_adr, former_fil, &forbidden_url);
  2429.  
  2430.                       /* User interaction, because hts_wait_delayed can be slow.. (3.43) */
  2431.                       t = time(NULL);
  2432.                       if (user_interact_timestamp == 0 || t - user_interact_timestamp > 0) {
  2433.                         user_interact_timestamp = t;
  2434.                         ENGINE_SAVE_CONTEXT();
  2435.                         {
  2436.                           hts_mirror_process_user_interaction(str, stre);
  2437.                         }
  2438.                         ENGINE_SET_CONTEXT();
  2439.                       }
  2440.                     }
  2441.  
  2442.                     // record!
  2443.                     if (r_sv!=-1) {  // pas d'erreur, on continue
  2444.                       /* log */
  2445.                       if ((opt->debug>1) && (opt->log!=NULL)) {
  2446.                         HTS_LOG(opt,LOG_DEBUG);
  2447.                         if (forbidden_url!=1) {    // le lien va Ωtre chargΘ
  2448.                           if ((p_type==2) || (p_type==-2)) {  // base href ou codebase, pas un lien
  2449.                             fprintf(opt->log,"Code/Codebase: %s%s"LF,adr,fil);
  2450.                           } else if ((opt->getmode & 4)==0) {
  2451.                             fprintf(opt->log,"Record: %s%s -> %s"LF,adr,fil,save);
  2452.                           } else {
  2453.                             if (!ishtml(opt,fil))
  2454.                               fprintf(opt->log,"Record after: %s%s -> %s"LF,adr,fil,save);
  2455.                             else
  2456.                               fprintf(opt->log,"Record: %s%s -> %s"LF,adr,fil,save);
  2457.                           } 
  2458.                         } else
  2459.                           fprintf(opt->log,"External: %s%s"LF,adr,fil);
  2460.                         test_flush;
  2461.                       }
  2462.                       /* FIN log */
  2463.  
  2464.                       // écrire lien
  2465.                       if ((p_type==2) || (p_type==-2)) {  // base href ou codebase, sauter
  2466.                         lastsaved=eadr-1+1;  // sauter "
  2467.                       }
  2468.                       /* */
  2469.                       else if (opt->urlmode==0) {    // URL absolue dans tous les cas
  2470.                         if ((opt->getmode & 1) && (ptr>0)) {    // ecrire les html
  2471.                           if (!link_has_authority(adr)) {
  2472.                             HT_ADD("http://");
  2473.                           } else {
  2474.                             char* aut = strstr(adr, "//");
  2475.                             if (aut) {
  2476.                               char tmp[256];
  2477.                               tmp[0]='\0';
  2478.                               strncatbuff(tmp, adr, (int) (aut - adr));   // scheme
  2479.                               HT_ADD(tmp);          // Protocol
  2480.                               HT_ADD("//");
  2481.                             }
  2482.                           }
  2483.  
  2484.                           if (!opt->passprivacy) {
  2485.                             HT_ADD_HTMLESCAPED(jump_protocol(adr));           // Password
  2486.                           } else {
  2487.                             HT_ADD_HTMLESCAPED(jump_identification(adr));     // No Password
  2488.                           }
  2489.                           if (*fil!='/')
  2490.                             HT_ADD("/");
  2491.                           HT_ADD_HTMLESCAPED(fil);
  2492.                         }
  2493.                         lastsaved=eadr-1;    // dernier Θcrit+1 (enfin euh apres on fait un ++ alors hein)
  2494.                         /* */
  2495.                       } else if (opt->urlmode >= 4) {    // ne rien faire dans tous les cas!
  2496.                         /* */
  2497.                         /* leave the link 'as is' */
  2498.                         /* Sinon, dépend de interne/externe */
  2499.                       } else if (forbidden_url==1) {    // le lien ne sera pas chargΘ, rΘfΘrence externe!
  2500.                         if ((opt->getmode & 1) && (ptr>0)) {
  2501.                           if (p_type!=-1) {     // pas que le nom de fichier (pas classe java)
  2502.                             if (!opt->external) {
  2503.                               if (!link_has_authority(adr)) {
  2504.                                 HT_ADD("http://");
  2505.                                 if (!opt->passprivacy) {
  2506.                                   HT_ADD_HTMLESCAPED(adr);     // Password
  2507.                                 } else {
  2508.                                   HT_ADD_HTMLESCAPED(jump_identification(adr));     // No Password
  2509.                                 }
  2510.                                 if (*fil!='/')
  2511.                                   HT_ADD("/");
  2512.                                 HT_ADD_HTMLESCAPED(fil);
  2513.                               } else {
  2514.                                 char* aut = strstr(adr, "//");
  2515.                                 if (aut) {
  2516.                                   char tmp[256];
  2517.                                   tmp[0]='\0';
  2518.                                   strncatbuff(tmp, adr, (int) (aut - adr));   // scheme
  2519.                                   HT_ADD(tmp);          // Protocol
  2520.                                   HT_ADD("//");
  2521.                                   if (!opt->passprivacy) {
  2522.                                     HT_ADD_HTMLESCAPED(jump_protocol(adr));          // Password
  2523.                                   } else {
  2524.                                     HT_ADD_HTMLESCAPED(jump_identification(adr));     // No Password
  2525.                                   }
  2526.                                   if (*fil!='/')
  2527.                                     HT_ADD("/");
  2528.                                   HT_ADD_HTMLESCAPED(fil);
  2529.                                 }
  2530.                               }
  2531.                               //
  2532.                             } else {    // fichier/page externe, mais on veut gΘnΘrer une erreur
  2533.                               //
  2534.                               int patch_it=0;
  2535.                               int add_url=0;
  2536.                               char* cat_name=NULL;
  2537.                               char* cat_data=NULL;
  2538.                               int cat_nb=0;
  2539.                               int cat_data_len=0;
  2540.  
  2541.                               // ajouter lien external
  2542.                               switch ( (link_has_authority(adr)) ? 1 : ( (fil[strlen(fil)-1]=='/')?1:(ishtml(opt,fil))  ) ) {
  2543.                             case 1: case -2:       // html ou rΘpertoire
  2544.                               if (opt->getmode & 1) {  // sauver html
  2545.                                 patch_it=1;   // redirect
  2546.                                 add_url=1;    // avec link?
  2547.                                 cat_name="external.html";
  2548.                                 cat_nb=0;
  2549.                                 cat_data=HTS_DATA_UNKNOWN_HTML;
  2550.                                 cat_data_len=HTS_DATA_UNKNOWN_HTML_LEN;
  2551.                               }
  2552.                               break;
  2553.                             default:    // inconnu
  2554.                               // asp, cgi..
  2555.                               if ( (strfield2(fil+max(0,(int)strlen(fil)-4),".gif")) 
  2556.                                 || (strfield2(fil+max(0,(int)strlen(fil)-4),".jpg")) 
  2557.                                 || (strfield2(fil+max(0,(int)strlen(fil)-4),".xbm")) 
  2558.                                 /*|| (ishtml(opt,fil)!=0)*/ ) {
  2559.                                 patch_it=1;   // redirect
  2560.                               add_url=1;    // avec link aussi
  2561.                               cat_name="external.gif";
  2562.                               cat_nb=1;
  2563.                               cat_data=HTS_DATA_UNKNOWN_GIF;
  2564.                               cat_data_len=HTS_DATA_UNKNOWN_GIF_LEN;
  2565.                                 } else /* if (is_dyntype(get_ext(fil))) */ {
  2566.                                   patch_it=1;   // redirect
  2567.                                   add_url=1;    // avec link?
  2568.                                   cat_name="external.html";
  2569.                                   cat_nb=0;
  2570.                                   cat_data=HTS_DATA_UNKNOWN_HTML;
  2571.                                   cat_data_len=HTS_DATA_UNKNOWN_HTML_LEN;
  2572.                                 }
  2573.                                 break;
  2574.                               }// html,gif
  2575.  
  2576.                               if (patch_it) {
  2577.                                 char BIGSTK save[HTS_URLMAXSIZE*2];
  2578.                                 char BIGSTK tempo[HTS_URLMAXSIZE*2];
  2579.                                 strcpybuff(save,StringBuff(opt->path_html));
  2580.                                 strcatbuff(save,cat_name);
  2581.                                 if (lienrelatif(tempo,save, relativesavename)==0) {
  2582.                                                                     /* Never escape high-chars (we don't know the encoding!!) */
  2583.                                   escape_uri_utf(tempo);     // escape with %xx
  2584.                                   //if (!no_esc_utf)
  2585.                                   //  escape_uri(tempo);     // escape with %xx
  2586.                                   //else
  2587.                                   //  escape_uri_utf(tempo);     // escape with %xx
  2588.                                   HT_ADD_HTMLESCAPED(tempo);    // page externe
  2589.                                   if (add_url) {
  2590.                                     HT_ADD("?link=");    // page externe
  2591.  
  2592.                                     // same as above
  2593.                                     if (!link_has_authority(adr)) {
  2594.                                       HT_ADD("http://");
  2595.                                       if (!opt->passprivacy) {
  2596.                                         HT_ADD_HTMLESCAPED(adr);     // Password
  2597.                                       } else {
  2598.                                         HT_ADD_HTMLESCAPED(jump_identification(adr));     // No Password
  2599.                                       }
  2600.                                       if (*fil!='/')
  2601.                                         HT_ADD("/");
  2602.                                       HT_ADD_HTMLESCAPED(fil);
  2603.                                     } else {
  2604.                                       char* aut = strstr(adr, "//");
  2605.                                       if (aut) {
  2606.                                         char tmp[256];
  2607.                                         tmp[0]='\0';
  2608.                                         strncatbuff(tmp, adr, (int) (aut - adr) + 2);   // scheme
  2609.                                         HT_ADD(tmp);
  2610.                                         if (!opt->passprivacy) {
  2611.                                           HT_ADD_HTMLESCAPED(jump_protocol(adr));          // Password
  2612.                                         } else {
  2613.                                           HT_ADD_HTMLESCAPED(jump_identification(adr));     // No Password
  2614.                                         }
  2615.                                         if (*fil!='/')
  2616.                                           HT_ADD("/");
  2617.                                         HT_ADD_HTMLESCAPED(fil);
  2618.                                       }
  2619.                                     }
  2620.                                     //
  2621.  
  2622.                                   }
  2623.                                 }
  2624.  
  2625.                                 // écrire fichier?
  2626.                                 if (verif_external(opt,cat_nb,1)) {
  2627.                                   //if (!fexist(fconcat(OPT_GET_BUFF(opt), StringBuff(opt->path_html),cat_name))) {
  2628.                                   FILE* fp = filecreate(&opt->state.strc, fconcat(OPT_GET_BUFF(opt), StringBuff(opt->path_html),cat_name));
  2629.                                   if (fp) {
  2630.                                     if (cat_data_len==0) {   // texte
  2631.                                       verif_backblue(opt,StringBuff(opt->path_html));
  2632.                                       fprintf(fp,"%s%s","<!-- Created by HTTrack Website Copier/"HTTRACK_VERSION" "HTTRACK_AFF_AUTHORS" -->"LF,cat_data);
  2633.                                     } else {                    // data
  2634.                                       fwrite(cat_data,cat_data_len,1,fp);
  2635.                                     }
  2636.                                     fclose(fp);
  2637.                                     usercommand(opt,0,NULL,fconcat(OPT_GET_BUFF(opt), StringBuff(opt->path_html),cat_name),"","");
  2638.                                   }
  2639.                                 }
  2640.                               }  else {    // Θcrire normalement le nom de fichier
  2641.                                 HT_ADD("http://");
  2642.                                 if (!opt->passprivacy) {
  2643.                                   HT_ADD_HTMLESCAPED(adr);       // Password
  2644.                                 } else {
  2645.                                   HT_ADD_HTMLESCAPED(jump_identification(adr));       // No Password
  2646.                                 }
  2647.                                 if (*fil!='/')
  2648.                                   HT_ADD("/");
  2649.                                 HT_ADD_HTMLESCAPED(fil);
  2650.                               }// patcher?
  2651.                             }  // external
  2652.                           } else {  // que le nom de fichier (classe java)
  2653.                             // en gros recopie de plus bas: copier codebase et base
  2654.                             if (p_flush) {
  2655.                               char BIGSTK tempo[HTS_URLMAXSIZE*2];    // <-- ajoutΘ
  2656.                               char BIGSTK tempo_pat[HTS_URLMAXSIZE*2];
  2657.  
  2658.                               // Calculer chemin
  2659.                               tempo_pat[0]='\0';
  2660.                               strcpybuff(tempo,fil);  // <-- ajoutΘ
  2661.                               {
  2662.                                 char* a=strrchr(tempo,'/');
  2663.  
  2664.                                 // Example: we converted code="x.y.z.foo.class" into "x/y/z/foo.class"
  2665.                                 // we have to do the contrary now
  2666.                                 if (add_class_dots_to_patch>0) {
  2667.                                   while( (add_class_dots_to_patch>0) && (a) ) {
  2668.                                     *a='.';     // convert "false" java / into .
  2669.                                     add_class_dots_to_patch--;
  2670.                                     a=strrchr(tempo,'/');
  2671.                                   }
  2672.                                   // if add_class_dots_to_patch, this is because there is a problem!!
  2673.                                   if (add_class_dots_to_patch) {
  2674.                                     if (opt->log) {
  2675.                                       HTS_LOG(opt,LOG_WARNING); fprintf(opt->log,"Error: can not rewind java path %s, check html code"LF,tempo);
  2676.                                       test_flush;
  2677.                                     }
  2678.                                   }
  2679.                                 }
  2680.  
  2681.                                 // Cut path/filename
  2682.                                 if (a) {
  2683.                                   char BIGSTK tempo2[HTS_URLMAXSIZE*2];
  2684.                                   strcpybuff(tempo2,a+1);         // FICHIER
  2685.                                   strncatbuff(tempo_pat,tempo,(int) (a - tempo)+1);  // chemin
  2686.                                   strcpybuff(tempo,tempo2);                     // fichier
  2687.                                 }
  2688.                               }
  2689.  
  2690.                               // écrire codebase="chemin"
  2691.                               if ((opt->getmode & 1) && (ptr>0)) {
  2692.                                 char BIGSTK tempo4[HTS_URLMAXSIZE*2];
  2693.                                 tempo4[0]='\0';
  2694.  
  2695.                                 if (strnotempty(tempo_pat)) {
  2696.                                   HT_ADD("codebase=\"http://");
  2697.                                   if (!opt->passprivacy) {
  2698.                                     HT_ADD_HTMLESCAPED(adr);  // Password
  2699.                                   } else {
  2700.                                     HT_ADD_HTMLESCAPED(jump_identification(adr));  // No Password
  2701.                                   }
  2702.                                   if (*tempo_pat!='/') HT_ADD("/");
  2703.                                   HT_ADD(tempo_pat);
  2704.                                   HT_ADD("\" ");
  2705.                                 }
  2706.  
  2707.                                 strncatbuff(tempo4,lastsaved,(int) (p_flush - lastsaved));
  2708.                                 HT_ADD(tempo4);    // refresh code="
  2709.                                 HT_ADD(tempo);
  2710.                               }
  2711.                             }
  2712.                           }
  2713.                         }
  2714.                         lastsaved=eadr-1;
  2715.                       }
  2716.                       /*
  2717.                       else if (opt->urlmode==1) {    // ABSOLU, c'est le cas le moins courant
  2718.                       //  NE FONCTIONNE PAS!!  (et est inutile)
  2719.                       if ((opt->getmode & 1) && (ptr>0)) {    // ecrire les html
  2720.                       // Θcrire le lien modifiΘ, absolu
  2721.                       HT_ADD("file:");
  2722.                       if (*save=='/')
  2723.                       HT_ADD(save+1)
  2724.                       else
  2725.                       HT_ADD(save)
  2726.                       }
  2727.                       lastsaved=eadr-1;    // dernier Θcrit+1 (enfin euh apres on fait un ++ alors hein)
  2728.                       }
  2729.                       */
  2730.                       else if (opt->mimehtml) {
  2731.                         char BIGSTK buff[HTS_URLMAXSIZE*3];
  2732.                         HT_ADD("cid:");
  2733.                         strcpybuff(buff, adr);
  2734.                         strcatbuff(buff, fil);
  2735.                         escape_in_url(buff);
  2736.                         { char* a = buff; while((a = strchr(a, '%'))) { *a = 'X'; a++; } }
  2737.                         HT_ADD_HTMLESCAPED(buff);
  2738.                         lastsaved=eadr-1;    // dernier Θcrit+1 (enfin euh apres on fait un ++ alors hein)
  2739.                       }
  2740.                       else if (opt->urlmode==3) {    // URI absolue /
  2741.                         if ((opt->getmode & 1) && (ptr>0)) {    // ecrire les html
  2742.                           HT_ADD_HTMLESCAPED(fil);
  2743.                         }
  2744.                         lastsaved=eadr-1;    // dernier Θcrit+1 (enfin euh apres on fait un ++ alors hein)
  2745.                       }
  2746.                       else if (opt->urlmode==2) {  // RELATIF
  2747.                         char BIGSTK tempo[HTS_URLMAXSIZE*2];
  2748.                         tempo[0]='\0';
  2749.                         // calculer le lien relatif
  2750.  
  2751.                         if (lienrelatif(tempo,save,relativesavename)==0) {
  2752.                           if (!in_media) {    // In media (such as real audio): don't patch
  2753.                                                         /* Never escape high-chars (we don't know the encoding!!) */
  2754.                                                         escape_uri_utf(tempo);
  2755.                             //if (!no_esc_utf)
  2756.                             //  escape_uri(tempo);     // escape with %xx
  2757.                             //else {
  2758.                             //  /* No escaping at all - remaining upper chars will be escaped below */
  2759.                             //  /* FIXME - Should be done in all local cases */
  2760.                             //  //x_escape_html(tempo);
  2761.                             //  //escape_uri_utf(tempo);     // FIXME - escape with %xx
  2762.                             //  //escape_uri(tempo);     // escape with %xx
  2763.                             //}
  2764.                           }
  2765.                           if ((opt->debug>1) && (opt->log!=NULL)) {
  2766.                             HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"relative link at %s build with %s and %s: %s"LF,adr,save,relativesavename,tempo);
  2767.                             test_flush;
  2768.                           }
  2769.  
  2770.                           // lien applet (code) - il faut placer un codebase avant
  2771.                           if (p_type==-1) {  // que le nom de fichier
  2772.  
  2773.                             if (p_flush) {
  2774.                               char BIGSTK tempo_pat[HTS_URLMAXSIZE*2];
  2775.                               tempo_pat[0]='\0';
  2776.                               {
  2777.                                 char* a=strrchr(tempo,'/');
  2778.  
  2779.                                 // Example: we converted code="x.y.z.foo.class" into "x/y/z/foo.class"
  2780.                                 // we have to do the contrary now
  2781.                                 if (add_class_dots_to_patch>0) {
  2782.                                   while( (add_class_dots_to_patch>0) && (a) ) {
  2783.                                     *a='.';     // convert "false" java / into .
  2784.                                     add_class_dots_to_patch--;
  2785.                                     a=strrchr(tempo,'/');
  2786.                                   }
  2787.                                   // if add_class_dots_to_patch, this is because there is a problem!!
  2788.                                   if (add_class_dots_to_patch) {
  2789.                                     if (opt->log) {
  2790.                                       HTS_LOG(opt,LOG_WARNING); fprintf(opt->log,"Error: can not rewind java path %s, check html code"LF,tempo);
  2791.                                       test_flush;
  2792.                                     }
  2793.                                   }
  2794.                                 }
  2795.  
  2796.                                 if (a) {
  2797.                                   char BIGSTK tempo2[HTS_URLMAXSIZE*2];
  2798.                                   strcpybuff(tempo2,a+1);
  2799.                                   strncatbuff(tempo_pat,tempo,(int) (a - tempo)+1);  // chemin
  2800.                                   strcpybuff(tempo,tempo2);                     // fichier
  2801.                                 }
  2802.                               }
  2803.  
  2804.                               // écrire codebase="chemin"
  2805.                               if ((opt->getmode & 1) && (ptr>0)) {
  2806.                                 char BIGSTK tempo4[HTS_URLMAXSIZE*2];
  2807.                                 tempo4[0]='\0';
  2808.  
  2809.                                 if (strnotempty(tempo_pat)) {
  2810.                                   HT_ADD("codebase=\"");
  2811.                                   HT_ADD_HTMLESCAPED(tempo_pat);
  2812.                                   HT_ADD("\" ");
  2813.                                 }
  2814.  
  2815.                                 strncatbuff(tempo4,lastsaved,(int) (p_flush - lastsaved));
  2816.                                 HT_ADD(tempo4);    // refresh code="
  2817.                               }
  2818.                             }
  2819.                             //lastsaved=adr;    // dernier Θcrit+1
  2820.                           }                              
  2821.  
  2822.                           if ((opt->getmode & 1) && (ptr>0)) {
  2823.                             // écrire le lien modifié, relatif
  2824.                             // Note: escape all chars, even >127 (no UTF)
  2825.                             HT_ADD_HTMLESCAPED_FULL(tempo);
  2826.  
  2827.                             // Add query-string, for informational purpose only
  2828.                             // Useless, because all parameters-pages are saved into different targets
  2829.                             if (opt->includequery) {
  2830.                               char* a=strchr(lien,'?');
  2831.                               if (a) {
  2832.                                 HT_ADD_HTMLESCAPED(a);
  2833.                               }
  2834.                             }
  2835.                           }
  2836.                           lastsaved=eadr-1;    // dernier Θcrit+1 (enfin euh apres on fait un ++ alors hein)
  2837.                         } else {
  2838.                           if (opt->log) {
  2839.                             fprintf(opt->log,"Error building relative link %s and %s"LF,save,relativesavename);
  2840.                             test_flush;
  2841.                           }
  2842.                         }
  2843.                       }  // sinon le lien sera Θcrit normalement
  2844.  
  2845.  
  2846. #if 0
  2847.                       if (fexist(save)) {    // le fichier existe..
  2848.                         adr[0]='\0';
  2849.                         //if ((opt->debug>0) && (opt->log!=NULL)) {
  2850.                         if (opt->log) {
  2851.                           HTS_LOG(opt,LOG_WARNING); fprintf(opt->log,"Link has already been written on disk, cancelled: %s"LF,save);
  2852.                           test_flush;
  2853.                         }
  2854.                       }
  2855. #endif                            
  2856.  
  2857.                       /* Security check */
  2858.                       if (strlen(save) >= HTS_URLMAXSIZE) {
  2859.                         adr[0]='\0';
  2860.                         if (opt->log) {
  2861.                           HTS_LOG(opt,LOG_WARNING); fprintf(opt->log,"Link is too long: %s"LF,save);
  2862.                           test_flush;
  2863.                         }
  2864.                       }
  2865.  
  2866.                       if ((adr[0]!='\0') && (p_type!=2) && (p_type!=-2) && (forbidden_url!=1) ) {  // si le fichier n'existe pas, ajouter α la liste                            
  2867.                         // n'y a-t-il pas trop de liens?
  2868.                         if (lien_tot+1 >= lien_max-4) {    // trop de liens!
  2869.                           printf("PANIC! : Too many URLs : >%d [%d]\n",lien_tot,__LINE__);
  2870.                           if (opt->log) {
  2871.                             fprintf(opt->log,LF"Too many URLs, giving up..(>%d)"LF,lien_max);
  2872.                             fprintf(opt->log,"To avoid that: use #L option for more links (example: -#L1000000)"LF);
  2873.                             test_flush;
  2874.                           }
  2875.                           if ((opt->getmode & 1) && (ptr>0)) { if (fp) { fclose(fp); fp=NULL; } }
  2876.                           XH_uninit;   // dΘsallocation mΘmoire & buffers
  2877.                           return -1;
  2878.  
  2879.                         } else {    // noter le lien sur la listes des liens α charger
  2880.                           int pass_fix,dejafait=0;
  2881.  
  2882.                           // Calculer la priorité de ce lien
  2883.                           if ((opt->getmode & 4)==0) {    // traiter html aprΦs
  2884.                             pass_fix=0;
  2885.                           } else {    // vΘrifier que ce n'est pas un !html
  2886.                             if (!ishtml(opt,fil))
  2887.                               pass_fix=1;        // prioritΘ infΘrieure (traiter aprΦs)
  2888.                             else
  2889.                               pass_fix=max(0,numero_passe);    // prioritΘ normale
  2890.                           }
  2891.  
  2892.                           /* If the file seems to be an html file, get depth-1 */
  2893.                           /*
  2894.                           if (strnotempty(save)) {
  2895.                           if (ishtml(opt,save) == 1) {
  2896.                           // descore_prio = 2;
  2897.                           } else {
  2898.                           // descore_prio = 1;
  2899.                           }
  2900.                           }
  2901.                           */
  2902.  
  2903.                           // vérifier que le lien n'a pas déjà été noté
  2904.                           // si c'est le cas, alors il faut s'assurer que la priorité associée
  2905.                           // au fichier est la plus grande des deux priorités
  2906.                           //
  2907.                           // On part de la fin et on essaye de se presser (économise temps machine)
  2908.                           {
  2909.                             int i=hash_read(hash,save,"",0,opt->urlhack);      // lecture type 0 (sav)
  2910.                             if (i>=0) {
  2911.                               if ((opt->debug>1) && (opt->log!=NULL)) {
  2912.                                 if (
  2913.                                   strcmp(adr, liens[i]->adr) != 0 
  2914.                                   || strcmp(fil, liens[i]->fil) != 0
  2915.                                   ) {
  2916.                                     HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"merging similar links %s%s and %s%s"LF,adr,fil,liens[i]->adr,liens[i]->fil);
  2917.                                     test_flush;
  2918.                                   }
  2919.                               }
  2920.                               liens[i]->depth=maximum(liens[i]->depth,liens[ptr]->depth - 1);
  2921.                               dejafait=1;
  2922.                             }
  2923.                           }
  2924.  
  2925.                           // le lien n'a jamais été créé.
  2926.                           // cette fois-ci, on le crée!
  2927.                           if (!dejafait) {                                
  2928.                             //
  2929.                             // >>>> CREER LE LIEN <<<<
  2930.                             //
  2931.                             // enregistrer lien à charger
  2932.                             //liens[lien_tot]->adr[0]=liens[lien_tot]->fil[0]=liens[lien_tot]->sav[0]='\0';
  2933.                             // même adresse: l'objet père est l'objet père de l'actuel
  2934.  
  2935.                                                         // DEBUT ROBOTS.TXT AJOUT
  2936.                                                         if (!just_test_it) {
  2937.                                                             if ((!strfield(adr,"ftp://"))         // non ftp
  2938.                                                                 && (!strfield(adr,"file://")) 
  2939. #if HTS_USEMMS
  2940.                                                                 && (!strfield(adr,"mms://")) 
  2941. #endif
  2942.                                                                 ) 
  2943.                                                             {    // non file
  2944.                                                                 if (opt->robots) {    // rΘcupΘrer robots
  2945.                                                                     if (ishtml(opt,fil)!=0) {                       // pas la peine pour des fichiers isolΘs
  2946.                                                                         if (checkrobots(_ROBOTS,adr,"") != -1) {    // robots.txt ?
  2947.                                                                             checkrobots_set(_ROBOTS ,adr,"");          // ajouter entrΘe vide
  2948.                                                                             if (checkrobots(_ROBOTS,adr,"") == -1) {    // robots.txt ?
  2949.                                                                                 // enregistrer robots.txt (MACRO)
  2950.                                                                                 liens_record(adr,"/robots.txt","","","");
  2951.                                                                                 if (liens[lien_tot]==NULL) {  // erreur, pas de place rΘservΘe
  2952.                                                                                     printf("PANIC! : Not enough memory [%d]\n",__LINE__);
  2953.                                                                                     if (opt->log) { 
  2954.                                                                                         fprintf(opt->log,"Not enough memory, can not re-allocate %d bytes"LF,(int)((add_tab_alloc+1)*sizeof(lien_url)));
  2955.                                                                                         test_flush;
  2956.                                                                                     }
  2957.                                                                                     if ((opt->getmode & 1) && (ptr>0)) { if (fp) { fclose(fp); fp=NULL; } }
  2958.                                                                                     XH_uninit;    // dΘsallocation mΘmoire & buffers
  2959.                                                                                     return -1;
  2960.                                                                                 }  
  2961.                                                                                 liens[lien_tot]->testmode=0;          // pas mode test
  2962.                                                                                 liens[lien_tot]->link_import=0;       // pas mode import     
  2963.                                                                                 liens[lien_tot]->premier=lien_tot;
  2964.                                                                                 liens[lien_tot]->precedent=ptr;
  2965.                                                                                 liens[lien_tot]->depth=0;
  2966.                                                                                 liens[lien_tot]->pass2=max(0,numero_passe);
  2967.                                                                                 liens[lien_tot]->retry=0;
  2968.                                                                                 lien_tot++;  // UN LIEN DE PLUS
  2969. #if DEBUG_ROBOTS
  2970.                                                                                 printf("robots.txt: added file robots.txt for %s\n",adr);
  2971. #endif
  2972.                                                                                 if ((opt->debug>1) && (opt->log!=NULL)) {
  2973.                                                                                     HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"robots.txt added at %s"LF,adr);
  2974.                                                                                     test_flush;
  2975.                                                                                 }
  2976.                                                                             } else {
  2977.                                                                                 if (opt->log) {   
  2978.                                                                                     fprintf(opt->log,"Unexpected robots.txt error at %d"LF,__LINE__);
  2979.                                                                                     test_flush;
  2980.                                                                                 }
  2981.                                                                             }
  2982.                                                                         }
  2983.                                                                     }
  2984.                                                                 }
  2985.                                                             }
  2986.                                                         }
  2987.                                                         // FIN ROBOTS.TXT AJOUT
  2988.  
  2989.                                                         // enregistrer (MACRO)
  2990.                                                         liens_record(adr,fil,save,former_adr,former_fil);
  2991.                                                         if (liens[lien_tot]==NULL) {  // erreur, pas de place rΘservΘe
  2992.                                                             printf("PANIC! : Not enough memory [%d]\n",__LINE__);
  2993.                                                             if (opt->log) { 
  2994.                                                                 fprintf(opt->log,"Not enough memory, can not re-allocate %d bytes"LF,(int)((add_tab_alloc+1)*sizeof(lien_url)));
  2995.                                                                 test_flush;
  2996.                                                             }
  2997.                               if ((opt->getmode & 1) && (ptr>0)) { if (fp) { fclose(fp); fp=NULL; } }
  2998.                               XH_uninit;    // dΘsallocation mΘmoire & buffers
  2999.                               return -1;
  3000.                             }  
  3001.  
  3002.                             // mode test?
  3003.                             if (!just_test_it)
  3004.                               liens[lien_tot]->testmode=0;          // pas mode test
  3005.                             else
  3006.                               liens[lien_tot]->testmode=1;          // mode test
  3007.                             if (!import_done)
  3008.                               liens[lien_tot]->link_import=0;       // pas mode import
  3009.                             else
  3010.                               liens[lien_tot]->link_import=1;       // mode import
  3011.                             // Θcrire autres paramΦtres de la structure-lien
  3012.                             if ((meme_adresse) && (!import_done) && (liens[ptr]->premier != 0))
  3013.                               liens[lien_tot]->premier=liens[ptr]->premier;
  3014.                             else    // sinon l'objet pΦre est le prΘcΘdent lui mΩme
  3015.                               liens[lien_tot]->premier=lien_tot;
  3016.                             // liens[lien_tot]->premier=ptr;
  3017.  
  3018.                             liens[lien_tot]->precedent=ptr;
  3019.                             // noter la prioritΘ
  3020.                             if (!set_prio_to)
  3021.                               liens[lien_tot]->depth=liens[ptr]->depth - 1;
  3022.                             else
  3023.                               liens[lien_tot]->depth=max(0,min(liens[ptr]->depth-1,set_prio_to-1));         // PRIORITE NULLE (catch page)
  3024.                             // noter pass
  3025.                             liens[lien_tot]->pass2=pass_fix;
  3026.                             liens[lien_tot]->retry=opt->retry;
  3027.  
  3028.                             //strcpybuff(liens[lien_tot]->adr,adr);
  3029.                             //strcpybuff(liens[lien_tot]->fil,fil);
  3030.                             //strcpybuff(liens[lien_tot]->sav,save); 
  3031.                             if ((opt->debug>1) && (opt->log!=NULL)) {
  3032.                               if (!just_test_it) {
  3033.                                 HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"OK, NOTE: %s%s -> %s"LF,liens[lien_tot]->adr,liens[lien_tot]->fil,liens[lien_tot]->sav);
  3034.                               } else {
  3035.                                 HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"OK, TEST: %s%s"LF,liens[lien_tot]->adr,liens[lien_tot]->fil);
  3036.                               }
  3037.                               test_flush;
  3038.                             }
  3039.  
  3040.                             lien_tot++;  // UN LIEN DE PLUS
  3041.                           } else { // if !dejafait
  3042.                             if ((opt->debug>1) && (opt->log!=NULL)) {
  3043.                               HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"link has already been recorded, cancelled: %s"LF,save);
  3044.                               test_flush;
  3045.                             }
  3046.  
  3047.                           }
  3048.  
  3049.  
  3050.                         }   // si pas trop de liens
  3051.                       }   // si adr[0]!='\0'
  3052.  
  3053.  
  3054.                     }  // if adr[0]!='\0' 
  3055.  
  3056.                   }  // if adr[0]!='\0'
  3057.  
  3058.                 }    // if strlen(lien)>0
  3059.  
  3060.               }   // if ok==0      
  3061.  
  3062.               assertf(eadr - adr >= 0);       // Should not go back
  3063.               if (eadr > adr) {
  3064.                 INCREMENT_CURRENT_ADR(eadr - 1 - adr);
  3065.               }
  3066.               // adr=eadr-1;  // ** sauter
  3067.  
  3068.               /* We skipped bytes and skip the " : reset state */
  3069.               /*if (inscript) {
  3070.               inscript_state_pos = INSCRIPT_START;
  3071.               }*/
  3072.  
  3073.           }  // if (p) 
  3074.  
  3075.         }  // si '<' ou '>'
  3076.  
  3077.         // plus loin
  3078.         adr++;      // automate will be checked next loop
  3079.  
  3080.  
  3081.         /* Otimization: if we are scanning in HTML data (not in tag or script), 
  3082.         then jump to the next starting tag */
  3083.         if (ptr>0) {
  3084.           if ( (!intag)         /* Not in tag */
  3085.             && (!inscript)      /* Not in (java)script */
  3086.             && (!in_media)      /* Not in media */
  3087.             && (!incomment)     /* Not in comment (<!--) */
  3088.             && (!inscript_tag)  /* Not in tag with script inside */
  3089.             ) 
  3090.           {
  3091.             /* Not at the end */
  3092.             if (( ((int) (adr - r->adr)) ) < r->size) {
  3093.               /* Not on a starting tag yet */
  3094.               if (*adr != '<') {
  3095.                 /* strchr does not well behave with null chrs.. */
  3096.                 /* char* adr_next = strchr(adr,'<'); */
  3097.                 char* adr_next = adr;
  3098.                 while(*adr_next != '<' && (adr_next - r->adr) < r->size ) {
  3099.                   adr_next++;
  3100.                 }
  3101.                 /* Jump to near end (index hack) */
  3102.                 if (!adr_next || *adr_next != '<') {
  3103.                   if (
  3104.                     ( (int)(adr - r->adr) < (r->size - 4)) 
  3105.                     &&
  3106.                     (r->size > 4)
  3107.                     ) {
  3108.                       adr = r->adr + r->size - 2;
  3109.                     }
  3110.                 } else {
  3111.                   adr = adr_next;
  3112.                 }
  3113.               }
  3114.             }
  3115.           }
  3116.         }
  3117.  
  3118.         // ----------
  3119.         // Θcrire peu α peu
  3120.         if ((opt->getmode & 1) && (ptr>0)) HT_ADD_ADR;
  3121.         lastsaved=adr;    // dernier Θcrit+1
  3122.         // ----------
  3123.  
  3124.         // Checks
  3125.         if (back_add_stats != opt->state.back_add_stats) {
  3126.           back_add_stats = opt->state.back_add_stats;
  3127.  
  3128.           // Check max time
  3129.           if (!back_checkmirror(opt)) {
  3130.             adr = r->adr + r->size;
  3131.           }
  3132.         }
  3133.  
  3134.         // pour les stats du shell si parsing trop long
  3135.         if (r->size)
  3136.           opt->state._hts_in_html_done=(100 * ((int) (adr - r->adr)) ) / (int)(r->size);
  3137.         if (opt->state._hts_in_html_poll) {
  3138.           opt->state._hts_in_html_poll=0;
  3139.           // temps α attendre, et remplir autant que l'on peut le cache (backing)
  3140.           back_wait(sback,opt,cache,HTS_STAT.stat_timestart);        
  3141.           back_fillmax(sback,opt,cache,liens,ptr,numero_passe,lien_tot);
  3142.  
  3143.           // Transfer rate
  3144.           engine_stats();
  3145.  
  3146.           // Refresh various stats
  3147.           HTS_STAT.stat_nsocket=back_nsoc(sback);
  3148.           HTS_STAT.stat_errors=fspc(opt, NULL,"error");
  3149.           HTS_STAT.stat_warnings=fspc(opt, NULL,"warning");
  3150.           HTS_STAT.stat_infos=fspc(opt, NULL,"info");
  3151.           HTS_STAT.nbk=backlinks_done(sback,liens,lien_tot,ptr);
  3152.           HTS_STAT.nb=back_transfered(HTS_STAT.stat_bytes,sback);
  3153.  
  3154.           if (!RUN_CALLBACK7(opt, loop, sback->lnk, sback->count, 0,ptr,lien_tot,(int) (time_local()-HTS_STAT.stat_timestart),&HTS_STAT)) {
  3155.             if (opt->log) {
  3156.               HTS_LOG(opt,LOG_INFO); fprintf(opt->log,"Exit requested by shell or user"LF);
  3157.               test_flush;
  3158.             } 
  3159.             *stre->exit_xh_=1;  // exit requested
  3160.             XH_uninit;
  3161.             return -1;
  3162.             //adr = r->adr + r->size;  // exit
  3163.           } else if (opt->state._hts_cancel == 1) {
  3164.             // adr = r->adr + r->size;  // exit
  3165.             nofollow=1;               // moins violent
  3166.             opt->state._hts_cancel = 0;
  3167.           }
  3168.  
  3169.         }
  3170.  
  3171.         // refresh the backing system each 2 seconds
  3172.         if (engine_stats()) {
  3173.           back_wait(sback,opt,cache,HTS_STAT.stat_timestart);        
  3174.           back_fillmax(sback,opt,cache,liens,ptr,numero_passe,lien_tot);
  3175.         }
  3176.       } while(( ((int) (adr - r->adr)) ) < r->size);
  3177.  
  3178.             opt->state._hts_in_html_parsing=0;  // flag
  3179.       opt->state._hts_cancel=0;           // pas de cancel
  3180.  
  3181.             if ((opt->getmode & 1) && (ptr>0)) {
  3182.         {
  3183.           char* cAddr = ht_buff;
  3184.           int cSize = (int) ht_len;
  3185.           if ( (opt->debug>0) && (opt->log!=NULL) ) {
  3186.             HTS_LOG(opt,LOG_INFO); fprintf(opt->log,"engine: postprocess-html: %s%s"LF, urladr, urlfil);
  3187.           }
  3188.           if (RUN_CALLBACK4(opt, postprocess, &cAddr, &cSize, urladr, urlfil) == 1) {
  3189.             ht_buff = cAddr;
  3190.             ht_len = cSize;
  3191.           }
  3192.         }
  3193.  
  3194.         /* Flush and save to disk */
  3195.         HT_ADD_END;    // achever
  3196.       }
  3197.       //
  3198.       //
  3199.       //
  3200.     }  // if !error
  3201.  
  3202.  
  3203.     if (opt->getmode & 1) { if (fp) { fclose(fp); fp=NULL; } }
  3204.     // sauver fichier
  3205.     //structcheck(savename);
  3206.     //filesave(opt,r->adr,r->size,savename);
  3207.  
  3208.   }  // analyse OK
  3209.  
  3210.   /* Apply changes */
  3211.   ENGINE_SAVE_CONTEXT();
  3212.  
  3213.   return 0;
  3214. }
  3215.  
  3216.  
  3217.  
  3218.  
  3219. /*
  3220. Check redirect status codes (301, 302, 307, ...: "moved"), as well as 412/416 and other non-OK status codes
  3221. */
  3222. int hts_mirror_check_moved(htsmoduleStruct* str, htsmoduleStructExtended* stre) {
  3223.   /* Load engine variables */
  3224.   ENGINE_LOAD_CONTEXT();  
  3225.  
  3226.   // DEBUT rattrapage des 301,302,307..
  3227.   // ------------------------------------------------------------
  3228.   if (!error) {
  3229.     ////////{
  3230.     // on a chargΘ un fichier en plus
  3231.     // if (!error) stat_loaded+=r.size;
  3232.  
  3233.     // ------------------------------------------------------------
  3234.     // Rattrapage des 301,302,307 (moved) et 412,416 - les 304 le sont dans le backing 
  3235.     // ------------------------------------------------------------
  3236.     if (HTTP_IS_REDIRECT(r->statuscode)) {          
  3237.         //if (r->adr!=NULL) {   // adr==null si fichier direct. [catch: davename normalement si cgi]
  3238.         //int i=0;
  3239.         char *rn=NULL;
  3240.         // char* p;
  3241.  
  3242.         if ( (opt->debug>0) && (opt->log!=NULL) ) {
  3243.           //if (opt->log) {
  3244.           HTS_LOG(opt,LOG_WARNING); fprintf(opt->log,"%s for %s%s"LF,r->msg,urladr,urlfil);
  3245.           test_flush;
  3246.         }
  3247.  
  3248.  
  3249.         {
  3250.           char BIGSTK mov_url[HTS_URLMAXSIZE*2],mov_adr[HTS_URLMAXSIZE*2],mov_fil[HTS_URLMAXSIZE*2];
  3251.           int get_it=0;         // ne pas prendre le fichier α la mΩme adresse par dΘfaut
  3252.           int reponse=0;
  3253.           mov_url[0]='\0'; mov_adr[0]='\0'; mov_fil[0]='\0';
  3254.           //
  3255.  
  3256.           strcpybuff(mov_url,r->location);
  3257.  
  3258.           // url qque -> adresse+fichier
  3259.           if ((reponse=ident_url_relatif(mov_url,urladr,urlfil,mov_adr,mov_fil))>=0) {                        
  3260.             int set_prio_to=0;    // pas de priotitΘ fixΘd par wizard
  3261.  
  3262.             // check whether URLHack is harmless or not
  3263.             if (opt->urlhack) {
  3264.               char BIGSTK n_adr[HTS_URLMAXSIZE*2], n_fil[HTS_URLMAXSIZE*2];
  3265.               char BIGSTK pn_adr[HTS_URLMAXSIZE*2], pn_fil[HTS_URLMAXSIZE*2];
  3266.               n_adr[0] = n_fil[0] = '\0';
  3267.               (void) adr_normalized(mov_adr, n_adr);
  3268.               (void) fil_normalized(mov_fil, n_fil);
  3269.               (void) adr_normalized(urladr, pn_adr);
  3270.               (void) fil_normalized(urlfil, pn_fil);
  3271.               if (strcasecmp(n_adr, pn_adr) == 0 && strcasecmp(n_fil, pn_fil) == 0) {
  3272.                 if (opt->log) {
  3273.                   HTS_LOG(opt,LOG_WARNING); fprintf(opt->log,"Redirected link is identical because of 'URL Hack' option: %s%s and %s%s"LF, urladr, urlfil, mov_adr, mov_fil);
  3274.                   test_flush;
  3275.                 }
  3276.               }
  3277.             }
  3278.  
  3279.             //if (ident_url_absolute(mov_url,mov_adr,mov_fil)!=-1) {    // ok URL reconnue
  3280.             // c'est (en gros) la mΩme URL..
  3281.             // si c'est un problΦme de casse dans le host c'est que le serveur est buggΘ
  3282.             // ("RFC says.." : host name IS case insensitive)
  3283.             if ((strfield2(mov_adr,urladr)!=0) && (strfield2(mov_fil,urlfil)!=0)) {  // identique α casse prΦs
  3284.               // on tourne en rond
  3285.               if (strcmp(mov_fil,urlfil)==0) {
  3286.                 error=1;
  3287.                 get_it=-1;        // ne rien faire
  3288.                 if (opt->log) {
  3289.                   HTS_LOG(opt,LOG_WARNING); fprintf(opt->log,"Can not bear crazy server (%s) for %s%s"LF,r->msg,urladr,urlfil);
  3290.                   test_flush;
  3291.                 }
  3292.               } else {    // mauvaise casse, effacer entrΘe dans la pile et rejouer une fois
  3293.                 get_it=1;
  3294.               }
  3295.             } else {        // adresse diffΘrente
  3296.               if (ishtml(opt,mov_url)==0) {   // pas mΩme adresse MAIS c'est un fichier non html (pas de page moved possible)
  3297.                 // -> on prend α cette adresse, le lien sera enregistrΘ avec lien_record() (hash)
  3298.                 if ((opt->debug>1) && (opt->log!=NULL)) {
  3299.                   HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"wizard link test for moved file at %s%s.."LF,mov_adr,mov_fil);
  3300.                   test_flush;
  3301.                 }
  3302.                 // acceptΘ?
  3303.                 if (hts_acceptlink(opt,ptr,lien_tot,liens,
  3304.                   mov_adr,mov_fil,
  3305.                   NULL, NULL,
  3306.                   &set_prio_to,
  3307.                   NULL) != 1) {                /* nouvelle adresse non refusΘe ? */
  3308.                     get_it=1;
  3309.                     if ((opt->debug>1) && (opt->log!=NULL)) {
  3310.                       HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"moved link accepted: %s%s"LF,mov_adr,mov_fil);
  3311.                       test_flush;
  3312.                     }
  3313.                   }
  3314.               } /* sinon traitΘ normalement */
  3315.             }
  3316.  
  3317.             //if ((strfield2(mov_adr,urladr)!=0) && (strfield2(mov_fil,urlfil)!=0)) {  // identique α casse prΦs
  3318.             if (get_it==1) {
  3319.               // court-circuiter le reste du traitement
  3320.               // et reculer pour mieux sauter
  3321.               if (opt->log) {
  3322.                 HTS_LOG(opt,LOG_WARNING); fprintf(opt->log,"Warning moved treated for %s%s (real one is %s%s)"LF,urladr,urlfil,mov_adr,mov_fil);
  3323.                 test_flush;
  3324.               }          
  3325.               // canceller lien actuel
  3326.               error=1;
  3327.               strcpybuff(liens[ptr]->adr,"!");  // caractΦre bidon (invalide hash)
  3328.               // noter NOUVEAU lien
  3329.               //xxc xxc
  3330.               //  set_prio_to=0+1;  // protection if the moved URL is an html page!!
  3331.               //xxc xxc
  3332.               {
  3333.                 char BIGSTK mov_sav[HTS_URLMAXSIZE*2];
  3334.                 // calculer lien et Θventuellement modifier addresse/fichier
  3335.                 if (url_savename(mov_adr,mov_fil,mov_sav,NULL,NULL,liens[liens[ptr]->precedent]->adr,liens[liens[ptr]->precedent]->fil,opt,liens,lien_tot,sback,cache,hash,ptr,numero_passe,NULL)!=-1) { 
  3336.                   if (hash_read(hash,mov_sav,"",0,0)<0) {      // n'existe pas dΘja
  3337.                     // enregistrer lien (MACRO) avec SAV IDENTIQUE
  3338.                     liens_record(mov_adr,mov_fil,liens[ptr]->sav,"","");
  3339.                     //liens_record(mov_adr,mov_fil,mov_sav,"","");
  3340.                     if (liens[lien_tot]!=NULL) {    // OK, pas d'erreur
  3341.                       // mode test?
  3342.                       liens[lien_tot]->testmode=liens[ptr]->testmode;
  3343.                       liens[lien_tot]->link_import=0;       // mode normal
  3344.                       if (!set_prio_to)
  3345.                         liens[lien_tot]->depth=liens[ptr]->depth;
  3346.                       else
  3347.                         liens[lien_tot]->depth=max(0,min(set_prio_to-1,liens[ptr]->depth));       // PRIORITE NULLE (catch page)
  3348.                       liens[lien_tot]->pass2=max(liens[ptr]->pass2,numero_passe);
  3349.                       liens[lien_tot]->retry=liens[ptr]->retry;
  3350.                       liens[lien_tot]->premier=liens[ptr]->premier;
  3351.                       liens[lien_tot]->precedent=liens[ptr]->precedent;
  3352.                       lien_tot++;
  3353.                     } else {  // oups erreur, plus de mΘmoire!!
  3354.                       printf("PANIC! : Not enough memory [%d]\n",__LINE__);
  3355.                       if (opt->log) {
  3356.                         fprintf(opt->log,"Not enough memory, can not re-allocate %d bytes"LF,(int)((add_tab_alloc+1)*sizeof(lien_url)));
  3357.                         test_flush;
  3358.                       }
  3359.                       //if (opt->getmode & 1) { if (fp) { fclose(fp); fp=NULL; } }
  3360.                       XH_uninit;    // dΘsallocation mΘmoire & buffers
  3361.                       return 0;
  3362.                     }
  3363.                   } else {
  3364.                     if ( (opt->debug>0) && (opt->log!=NULL) ) {
  3365.                       HTS_LOG(opt,LOG_WARNING); fprintf(opt->log,"moving %s to an existing file %s"LF,liens[ptr]->fil,urlfil);
  3366.                       test_flush;
  3367.                     }
  3368.                   }
  3369.  
  3370.                 }
  3371.               }
  3372.  
  3373.               //printf("-> %s %s %s\n",liens[lien_tot-1]->adr,liens[lien_tot-1]->fil,liens[lien_tot-1]->sav);
  3374.  
  3375.               // note mΘtaphysique: il se peut qu'il y ait un index.html et un INDEX.HTML
  3376.               // sous DOS ca marche pas trΦs bien... mais comme je suis gΘnial url_savename()
  3377.               // est α mΩme de rΘgler ce problΦme
  3378.             }
  3379.           } // ident_url_xx
  3380.  
  3381.           if (get_it==0) {    // adresse vraiment diffΘrente et potentiellement en html (pas de possibilitΘ de bouger la page tel quel α cause des <img src..> et cie)
  3382.             rn=(char*) calloct(8192,1);
  3383.             if (rn!=NULL) {
  3384.               if (opt->log) {
  3385.                 HTS_LOG(opt,LOG_WARNING); fprintf(opt->log,"File has moved from %s%s to %s"LF,urladr,urlfil,mov_url);
  3386.                 test_flush;
  3387.               }
  3388.               if (!opt->mimehtml) {
  3389.                 escape_uri(mov_url);
  3390.               } else {
  3391.                 char BIGSTK buff[HTS_URLMAXSIZE*3];
  3392.                 strcpybuff(buff, mov_adr);
  3393.                 strcatbuff(buff, mov_fil);
  3394.                 escape_in_url(buff);
  3395.                 { char* a = buff; while((a = strchr(a, '%'))) { *a = 'X'; a++; } }
  3396.                 strcpybuff(mov_url, "cid:");
  3397.                 strcatbuff(mov_url, buff);
  3398.               }
  3399.               // On prΘpare une page qui sautera immΘdiatement sur la bonne URL
  3400.               // Le scanner re-changera, ensuite, cette URL, pour la mirrorer!
  3401.               strcpybuff(rn,"<HTML>"CRLF);
  3402.               strcatbuff(rn,"<!-- Created by HTTrack Website Copier/"HTTRACK_VERSION" "HTTRACK_AFF_AUTHORS" -->"CRLF);
  3403.               strcatbuff(rn,"<HEAD>"CRLF"<TITLE>Page has moved</TITLE>"CRLF"</HEAD>"CRLF"<BODY>"CRLF);
  3404.               strcatbuff(rn,"<META HTTP-EQUIV=\"Refresh\" CONTENT=\"0; URL=");
  3405.               strcatbuff(rn,mov_url);    // URL
  3406.               strcatbuff(rn,"\">"CRLF);
  3407.               strcatbuff(rn,"<A HREF=\"");
  3408.               strcatbuff(rn,mov_url);
  3409.               strcatbuff(rn,"\">");
  3410.               strcatbuff(rn,"<B>Click here...</B></A>"CRLF);
  3411.               strcatbuff(rn,"</BODY>"CRLF);
  3412.               strcatbuff(rn,"<!-- Created by HTTrack Website Copier/"HTTRACK_VERSION" "HTTRACK_AFF_AUTHORS" -->"CRLF);
  3413.               strcatbuff(rn,"</HTML>"CRLF);
  3414.  
  3415.               // changer la page
  3416.               if (r->adr) { 
  3417.                 freet(r->adr); 
  3418.                 r->adr=NULL; 
  3419.               }
  3420.               r->adr=rn;
  3421.               r->size=strlen(r->adr);
  3422.               strcpybuff(r->contenttype, "text/html");
  3423.             }
  3424.           }  // get_it==0
  3425.  
  3426.         }     // bloc
  3427.         // erreur HTTP (ex: 404, not found)
  3428.       } else if (
  3429.         (r->statuscode==412)
  3430.         || (r->statuscode==416)
  3431.         ) {    // Precondition Failed, c'est α dire pour nous redemander TOUT le fichier
  3432.           if (fexist(liens[ptr]->sav)) {
  3433.             remove(liens[ptr]->sav);    // Eliminer
  3434.             if (!fexist(liens[ptr]->sav)) {  // Bien ΘliminΘ? (sinon on boucle..)
  3435. #if HDEBUG
  3436.               printf("Partial content NOT up-to-date, reget all file for %s\n",liens[ptr]->sav);
  3437. #endif
  3438.               if ( (opt->debug>1) && (opt->log!=NULL) ) {
  3439.                 //if (opt->log) {
  3440.                 HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"Partial file reget (%s) for %s%s"LF,r->msg,urladr,urlfil);
  3441.                 test_flush;
  3442.               }
  3443.               // enregistrer le MEME lien (MACRO)
  3444.               liens_record(liens[ptr]->adr,liens[ptr]->fil,liens[ptr]->sav,"","");
  3445.               if (liens[lien_tot]!=NULL) {    // OK, pas d'erreur
  3446.                 liens[lien_tot]->testmode=liens[ptr]->testmode;          // mode test?
  3447.                 liens[lien_tot]->link_import=0;       // pas mode import
  3448.                 liens[lien_tot]->depth=liens[ptr]->depth;
  3449.                 liens[lien_tot]->pass2=max(liens[ptr]->pass2,numero_passe);
  3450.                 liens[lien_tot]->retry=liens[ptr]->retry;
  3451.                 liens[lien_tot]->premier=liens[ptr]->premier;
  3452.                 liens[lien_tot]->precedent=ptr;
  3453.                 lien_tot++;
  3454.                 //
  3455.                 // canceller lien actuel
  3456.                 error=1;
  3457.                 strcpybuff(liens[ptr]->adr,"!");  // caractΦre bidon (invalide hash)
  3458.                 //
  3459.               } else {  // oups erreur, plus de mΘmoire!!
  3460.                 printf("PANIC! : Not enough memory [%d]\n",__LINE__);
  3461.                 if (opt->log) {
  3462.                   fprintf(opt->log,"Not enough memory, can not re-allocate %d bytes"LF,(int)((add_tab_alloc+1)*sizeof(lien_url)));
  3463.                   test_flush;
  3464.                 }
  3465.                 //if (opt->getmode & 1) { if (fp) { fclose(fp); fp=NULL; } }
  3466.                 XH_uninit;    // dΘsallocation mΘmoire & buffers
  3467.                 return 0;
  3468.               } 
  3469.             } else {
  3470.               if (opt->log!=NULL) {
  3471.                 HTS_LOG(opt,LOG_ERROR); fprintf(opt->log,"Can not remove old file %s"LF,urlfil);
  3472.                 test_flush;
  3473.               }
  3474.             }
  3475.           } else {
  3476.             if (opt->log!=NULL) {
  3477.               HTS_LOG(opt,LOG_WARNING); fprintf(opt->log,"Unexpected 412/416 error (%s) for %s%s"LF,r->msg,urladr,urlfil);
  3478.               test_flush;
  3479.             }
  3480.           }
  3481.         } else if (r->statuscode!=HTTP_OK) {
  3482.           int can_retry=0;
  3483.  
  3484.           // cas o∙ l'on peut reessayer
  3485.           switch(r->statuscode) {
  3486.             //case -1: can_retry=1; break;
  3487.           case STATUSCODE_TIMEOUT:
  3488.             if (opt->hostcontrol) {    // timeout et retry ΘpuisΘs
  3489.               if ((opt->hostcontrol & 1) && (liens[ptr]->retry<=0)) {
  3490.                 if ((opt->debug>1) && (opt->log!=NULL)) {
  3491.                   HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"Link banned: %s%s"LF,urladr,urlfil); test_flush;
  3492.                 }
  3493.                 host_ban(opt,liens,ptr,lien_tot,sback,jump_identification(urladr));
  3494.                 if ((opt->debug>1) && (opt->log!=NULL)) {
  3495.                   HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"Info: previous log - link banned: %s%s"LF,urladr,urlfil); test_flush;
  3496.                 }
  3497.               } else can_retry=1;
  3498.             } else can_retry=1;
  3499.             break;
  3500.           case STATUSCODE_SLOW:
  3501.             if ((opt->hostcontrol) && (liens[ptr]->retry<=0)) {    // too slow
  3502.               if (opt->hostcontrol & 2) {
  3503.                 if ((opt->debug>1) && (opt->log!=NULL)) {
  3504.                   HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"Link banned: %s%s"LF,urladr,urlfil); test_flush;
  3505.                 }
  3506.                 host_ban(opt,liens,ptr,lien_tot,sback,jump_identification(urladr));
  3507.                 if ((opt->debug>1) && (opt->log!=NULL)) {
  3508.                   HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"Info: previous log - link banned: %s%s"LF,urladr,urlfil); test_flush;
  3509.                 }
  3510.               } else can_retry=1;
  3511.             } else can_retry=1;
  3512.             break;
  3513.           case STATUSCODE_CONNERROR:            // connect closed
  3514.             can_retry=1;
  3515.             break;
  3516.           case STATUSCODE_NON_FATAL:            // other (non fatal) error
  3517.             can_retry=1;
  3518.             break;
  3519.           case STATUSCODE_SSL_HANDSHAKE:            // bad SSL handskake
  3520.             can_retry=1;
  3521.             break;
  3522.           case 408: case 409: case 500: case 502: case 504: 
  3523.             can_retry=1;
  3524.             break;
  3525.           }
  3526.  
  3527.           if ( strcmp(liens[ptr]->fil,"/primary") != 0 ) {  // no primary (internal page 0)
  3528.             if ((liens[ptr]->retry<=0) || (!can_retry) ) {  // retry ΘpuisΘs (ou retry impossible)
  3529.               if (opt->log) {
  3530.                 if ((opt->retry>0) && (can_retry)){
  3531.                   HTS_LOG(opt,LOG_ERROR); 
  3532.                   fprintf(opt->log,"\"%s\" (%d) after %d retries at link %s%s (from %s%s)"LF,r->msg,r->statuscode,opt->retry,urladr,urlfil,liens[liens[ptr]->precedent]->adr,liens[liens[ptr]->precedent]->fil);
  3533.                 } else {
  3534.                   if (r->statuscode==STATUSCODE_TEST_OK) {    // test OK
  3535.                     if ((opt->debug>0) && (opt->log!=NULL)) {
  3536.                       HTS_LOG(opt,LOG_INFO); 
  3537.                       fprintf(opt->log,"Test OK at link %s%s (from %s%s)"LF,urladr,urlfil,liens[liens[ptr]->precedent]->adr,liens[liens[ptr]->precedent]->fil);
  3538.                     }
  3539.                   } else {
  3540.                     if (strcmp(urlfil,"/robots.txt")) {       // ne pas afficher d'infos sur robots.txt par dΘfaut
  3541.                       HTS_LOG(opt,LOG_ERROR); 
  3542.                       fprintf(opt->log,"\"%s\" (%d) at link %s%s (from %s%s)"LF,r->msg,r->statuscode,urladr,urlfil,liens[liens[ptr]->precedent]->adr,liens[liens[ptr]->precedent]->fil);
  3543.                     } else {
  3544.                       if (opt->debug>1) {
  3545.                         HTS_LOG(opt,LOG_INFO); fprintf(opt->log,"No robots.txt rules at %s"LF,urladr);
  3546.                         test_flush;
  3547.                       }
  3548.                     }
  3549.                   }
  3550.                 }
  3551.                 test_flush;
  3552.               }
  3553.  
  3554.               // NO error in trop level
  3555.               // due to the "no connection -> previous restored" hack
  3556.               // This prevent the engine from wiping all data if the website has been deleted (or moved)
  3557.               // since last time (which is quite annoying)
  3558.               if (liens[ptr]->precedent != 0) {
  3559.                 // ici on teste si on doit enregistrer la page tout de mΩme
  3560.                 if (opt->errpage) {
  3561.                   store_errpage=1;
  3562.                 }
  3563.               } else {
  3564.                 if (strcmp(urlfil,"/robots.txt") != 0) {
  3565.                   /*
  3566.                   This is an error caused by a link entered by the user
  3567.                   That is, link(s) entered by user are invalid (404, 500, connect error, proxy error->.)
  3568.                   If all links entered are invalid, the session failed and we will attempt to restore
  3569.                   the previous one
  3570.                   Example: Try to update a website which has been deleted remotely: this may delete
  3571.                   the website locally, which is really not desired (especially if the website disappeared!)
  3572.                   With this hack, the engine won't wipe local files (how clever)
  3573.                   */
  3574.                   HTS_STAT.stat_errors_front++;
  3575.                 }
  3576.               }
  3577.  
  3578.             } else {    // retry!!
  3579.               if (opt->debug>0 && opt->log != NULL) {  // on fera un alert si le retry Θchoue               
  3580.                 HTS_LOG(opt,LOG_WARNING); fprintf(opt->log,"Retry after error %d (%s) at link %s%s (from %s%s)"LF,r->statuscode,r->msg,urladr,urlfil,liens[liens[ptr]->precedent]->adr,liens[liens[ptr]->precedent]->fil);
  3581.                 test_flush;
  3582.               }
  3583.               // redemander fichier
  3584.               liens_record(urladr,urlfil,savename,"","");
  3585.               if (liens[lien_tot]!=NULL) {    // OK, pas d'erreur
  3586.                 liens[lien_tot]->testmode=liens[ptr]->testmode;          // mode test?
  3587.                 liens[lien_tot]->link_import=0;       // pas mode import
  3588.                 liens[lien_tot]->depth=liens[ptr]->depth;
  3589.                 liens[lien_tot]->pass2=max(liens[ptr]->pass2,numero_passe);
  3590.                 liens[lien_tot]->retry=liens[ptr]->retry-1;    // moins 1 retry!
  3591.                 liens[lien_tot]->premier=liens[ptr]->premier;
  3592.                 liens[lien_tot]->precedent=liens[ptr]->precedent;
  3593.                 lien_tot++;
  3594.               } else {  // oups erreur, plus de mΘmoire!!
  3595.                 printf("PANIC! : Not enough memory [%d]\n",__LINE__);
  3596.                 if (opt->log) {
  3597.                   HTS_LOG(opt,LOG_PANIC); 
  3598.                   fprintf(opt->log,"Not enough memory, can not re-allocate %d bytes"LF,(int)((add_tab_alloc+1)*sizeof(lien_url)));
  3599.                   test_flush;
  3600.                 }
  3601.                 //if (opt->getmode & 1) { if (fp) { fclose(fp); fp=NULL; } }
  3602.                 XH_uninit;    // dΘsallocation mΘmoire & buffers
  3603.                 return 0;
  3604.               } 
  3605.             }
  3606.           } else {
  3607.             if (opt->log) {
  3608.               if (opt->debug>1) {
  3609.                 HTS_LOG(opt,LOG_INFO); 
  3610.                 fprintf(opt->log,"Info: no robots.txt at %s%s"LF,urladr,urlfil);
  3611.               }
  3612.             }
  3613.           }
  3614.           if (!store_errpage) {
  3615.             if (r->adr) {     // dΘsalloc
  3616.               freet(r->adr); 
  3617.               r->adr=NULL; 
  3618.             }
  3619.             error=1;  // erreur!
  3620.           }
  3621.         }
  3622.         // FIN rattrapage des 301,302,307..
  3623.         // ------------------------------------------------------------
  3624.  
  3625.   }  // if !error
  3626.  
  3627.  
  3628.   /* Apply changes */
  3629.   ENGINE_SAVE_CONTEXT();
  3630.  
  3631.   return 0;
  3632.  
  3633.  
  3634. }
  3635.  
  3636. /*
  3637.   Process pause, link adding..
  3638. */
  3639. void hts_mirror_process_user_interaction(htsmoduleStruct* str, htsmoduleStructExtended* stre) {
  3640.   int b;
  3641.   /* Load engine variables */
  3642.   ENGINE_LOAD_CONTEXT();
  3643.  
  3644. #if BDEBUG==1
  3645.   printf("\nBack test..\n");
  3646. #endif
  3647.  
  3648.   // pause/lock files
  3649.   {
  3650.     int do_pause=0;
  3651.  
  3652.     // user pause lockfile : create hts-paused.lock --> HTTrack will be paused
  3653.     if (fexist(fconcat(OPT_GET_BUFF(opt), StringBuff(opt->path_log),"hts-stop.lock"))) {
  3654.       // remove lockfile
  3655.       remove(fconcat(OPT_GET_BUFF(opt), StringBuff(opt->path_log),"hts-stop.lock"));
  3656.       if (!fexist(fconcat(OPT_GET_BUFF(opt), StringBuff(opt->path_log),"hts-stop.lock"))) {
  3657.         do_pause=1;
  3658.       }
  3659.     }
  3660.  
  3661.     // after receving N bytes, pause
  3662.     if (opt->fragment>0) {
  3663.       if ((HTS_STAT.stat_bytes-stat_fragment) > opt->fragment) {
  3664.         do_pause=1;
  3665.       }
  3666.     }
  3667.  
  3668.     // pause?
  3669.     if (do_pause) {
  3670.       if ( (opt->debug>0) && (opt->log!=NULL) ) {
  3671.         HTS_LOG(opt,LOG_INFO); fprintf(opt->log,"engine: pause requested.."LF);
  3672.       }
  3673.       while (back_nsoc(sback)>0) {                  // attendre fin des transferts
  3674.         back_wait(sback,opt,cache,HTS_STAT.stat_timestart);
  3675.         Sleep(200);
  3676.         {
  3677.           back_wait(sback,opt,cache,HTS_STAT.stat_timestart);
  3678.  
  3679.           // Transfer rate
  3680.           engine_stats();
  3681.  
  3682.           // Refresh various stats
  3683.           HTS_STAT.stat_nsocket=back_nsoc(sback);
  3684.           HTS_STAT.stat_errors=fspc(opt,NULL,"error");
  3685.           HTS_STAT.stat_warnings=fspc(opt,NULL,"warning");
  3686.           HTS_STAT.stat_infos=fspc(opt,NULL,"info");
  3687.           HTS_STAT.nbk=backlinks_done(sback,liens,lien_tot,ptr);
  3688.           HTS_STAT.nb=back_transfered(HTS_STAT.stat_bytes,sback);
  3689.  
  3690.           b=0;
  3691.           if (!RUN_CALLBACK7(opt, loop, sback->lnk, sback->count, b,ptr,lien_tot,(int) (time_local()-HTS_STAT.stat_timestart),&HTS_STAT)
  3692.             || !back_checkmirror(opt)) {
  3693.               if (opt->log) {
  3694.                 HTS_LOG(opt,LOG_INFO); fprintf(opt->log,"Exit requested by shell or user"LF);
  3695.                 test_flush;
  3696.               }
  3697.               *stre->exit_xh_=1;  // exit requested
  3698.               XH_uninit;
  3699.               return ;
  3700.             }
  3701.         }
  3702.       }
  3703.       // On dΘsalloue le buffer d'enregistrement des chemins crΘΘe, au cas o∙ pendant la pause
  3704.       // l'utilisateur ferait un rm -r aprΦs avoir effectuΘ un tar
  3705.       // structcheck_init(1);
  3706.       {
  3707.         FILE* fp = fopen(fconcat(OPT_GET_BUFF(opt), StringBuff(opt->path_log),"hts-paused.lock"),"wb");
  3708.         if (fp) {
  3709.           fspc(NULL,fp,"info");  // dater
  3710.           fprintf(fp,"Pause"LF"HTTrack is paused after retreiving "LLintP" bytes"LF"Delete this file to continue the mirror->.."LF""LF"",(LLint)HTS_STAT.stat_bytes);
  3711.           fclose(fp);
  3712.         }
  3713.       }
  3714.       stat_fragment=HTS_STAT.stat_bytes;
  3715.       /* Info for wrappers */
  3716.       if ( (opt->debug>0) && (opt->log!=NULL) ) {
  3717.         HTS_LOG(opt,LOG_INFO); fprintf(opt->log,"engine: pause: %s"LF,fconcat(OPT_GET_BUFF(opt), StringBuff(opt->path_log),"hts-paused.lock"));
  3718.       }
  3719.       RUN_CALLBACK1(opt, pause, fconcat(OPT_GET_BUFF(opt), StringBuff(opt->path_log),"hts-paused.lock"));
  3720.     }
  3721.     //
  3722.   }
  3723.   // end of pause/lock files
  3724.  
  3725.   // changement dans les prΘfΘrences
  3726.   if (opt->state._hts_addurl) {
  3727.     char BIGSTK add_adr[HTS_URLMAXSIZE*2];
  3728.     char BIGSTK add_fil[HTS_URLMAXSIZE*2];
  3729.     while(*opt->state._hts_addurl) {
  3730.       char BIGSTK add_url[HTS_URLMAXSIZE*2];
  3731.       add_adr[0]=add_fil[0]=add_url[0]='\0';
  3732.       if (!link_has_authority(*opt->state._hts_addurl))
  3733.         strcpybuff(add_url,"http://");          // ajouter http://
  3734.       strcatbuff(add_url,*opt->state._hts_addurl);
  3735.       if (ident_url_absolute(add_url,add_adr,add_fil)>=0) {
  3736.         // ----Ajout----
  3737.         // noter NOUVEAU lien
  3738.         char BIGSTK add_sav[HTS_URLMAXSIZE*2];
  3739.         // calculer lien et Θventuellement modifier addresse/fichier
  3740.         if (url_savename(add_adr,add_fil,add_sav,NULL,NULL,NULL,NULL,opt,liens,lien_tot,sback,cache,hash,ptr,numero_passe,NULL)!=-1) { 
  3741.           if (hash_read(hash,add_sav,"",0,0)<0) {      // n'existe pas dΘja
  3742.             // enregistrer lien (MACRO)
  3743.             liens_record(add_adr,add_fil,add_sav,"","");
  3744.             if (liens[lien_tot]!=NULL) {    // OK, pas d'erreur
  3745.               liens[lien_tot]->testmode=0;          // mode test?
  3746.               liens[lien_tot]->link_import=0;       // mode normal
  3747.               liens[lien_tot]->depth=opt->depth;
  3748.               liens[lien_tot]->pass2=max(0,numero_passe);
  3749.               liens[lien_tot]->retry=opt->retry;
  3750.               liens[lien_tot]->premier=lien_tot;
  3751.               liens[lien_tot]->precedent=lien_tot;
  3752.               lien_tot++;
  3753.               //
  3754.               if ((opt->debug>0) && (opt->log!=NULL)) {
  3755.                 HTS_LOG(opt,LOG_INFO); fprintf(opt->log,"Link added by user: %s%s"LF,add_adr,add_fil); test_flush;
  3756.               }
  3757.               //
  3758.             } else {  // oups erreur, plus de mΘmoire!!
  3759.               printf("PANIC! : Not enough memory [%d]\n",__LINE__);
  3760.               if (opt->log) {
  3761.                 fprintf(opt->log,"Not enough memory, can not re-allocate %d bytes"LF,(int)((add_tab_alloc+1)*sizeof(lien_url)));
  3762.                 test_flush;
  3763.               }
  3764.               //if (opt->getmode & 1) { if (fp) { fclose(fp); fp=NULL; } }
  3765.               XH_uninit;    // dΘsallocation mΘmoire & buffers
  3766.               return ;
  3767.             }
  3768.           } else {
  3769.             if ( (opt->debug>0) && (opt->log!=NULL) ) {
  3770.               HTS_LOG(opt,LOG_WARNING); fprintf(opt->log,"Existing link %s%s not added after user request"LF,add_adr,add_fil);
  3771.               test_flush;
  3772.             }
  3773.           }
  3774.  
  3775.         }
  3776.       } else {
  3777.         if (opt->log) {
  3778.           HTS_LOG(opt,LOG_ERROR);
  3779.           fprintf(opt->log,"Error during URL decoding for %s"LF,add_url);
  3780.           test_flush;
  3781.         }
  3782.       }
  3783.       // ----Fin Ajout----
  3784.       opt->state._hts_addurl++;                  // suivante
  3785.     }
  3786.     opt->state._hts_addurl=NULL;           // libΘrer _hts_addurl
  3787.   }
  3788.   // si une pause a ΘtΘ demandΘe
  3789.   if (opt->state._hts_setpause || back_pluggable_sockets_strict(sback, opt) <= 0) {
  3790.     // index du lien actuel
  3791.     int b=back_index(opt,sback,urladr,urlfil,savename);
  3792.     int prev = opt->state._hts_in_html_parsing;
  3793.     if (b<0) b=0;    // forcer pour les stats
  3794.     while(opt->state._hts_setpause || back_pluggable_sockets_strict(sback, opt) <= 0) {    // on fait la pause..
  3795.       opt->state._hts_in_html_parsing = 6;
  3796.       back_wait(sback,opt,cache,HTS_STAT.stat_timestart);
  3797.  
  3798.       // Transfer rate
  3799.       engine_stats();
  3800.  
  3801.       // Refresh various stats
  3802.       HTS_STAT.stat_nsocket=back_nsoc(sback);
  3803.       HTS_STAT.stat_errors=fspc(opt,NULL,"error");
  3804.       HTS_STAT.stat_warnings=fspc(opt,NULL,"warning");
  3805.       HTS_STAT.stat_infos=fspc(opt,NULL,"info");
  3806.       HTS_STAT.nbk=backlinks_done(sback,liens,lien_tot,ptr);
  3807.       HTS_STAT.nb=back_transfered(HTS_STAT.stat_bytes,sback);
  3808.  
  3809.       if (!RUN_CALLBACK7(opt, loop, sback->lnk, sback->count, b,ptr,lien_tot,(int) (time_local()-HTS_STAT.stat_timestart),&HTS_STAT)) {
  3810.         if (opt->log) {
  3811.           HTS_LOG(opt,LOG_INFO); fprintf(opt->log,"Exit requested by shell or user"LF);
  3812.           test_flush;
  3813.         }
  3814.         *stre->exit_xh_=1;  // exit requested
  3815.         XH_uninit;
  3816.         return ;
  3817.       }
  3818.       Sleep(100);  // pause
  3819.     }
  3820.     opt->state._hts_in_html_parsing = prev;
  3821.   }
  3822.   ENGINE_SAVE_CONTEXT();
  3823.   return ;
  3824. }
  3825.  
  3826. /*
  3827. Wait for next file and
  3828. check 301, 302, .. statuscodes (moved)
  3829. */
  3830. int hts_mirror_wait_for_next_file(htsmoduleStruct* str, htsmoduleStructExtended* stre) {
  3831.   /* Load engine variables */
  3832.   ENGINE_DEFINE_CONTEXT();
  3833.   int b;
  3834.   int n;
  3835.  
  3836.   /* User interaction */
  3837.   ENGINE_SAVE_CONTEXT();
  3838.   {
  3839.     hts_mirror_process_user_interaction(str, stre);
  3840.   }
  3841.   ENGINE_SET_CONTEXT();
  3842.  
  3843.   // si le fichier n'est pas en backing, le mettre..
  3844.   if (!back_exist(str->sback,str->opt,urladr,urlfil,savename)) {
  3845. #if BDEBUG==1
  3846.     printf("crash backing: %s%s\n",liens[ptr]->adr,liens[ptr]->fil);
  3847. #endif
  3848.     if (back_add(sback,opt,cache,urladr,urlfil,savename,liens[liens[ptr]->precedent]->adr,liens[liens[ptr]->precedent]->fil,liens[ptr]->testmode)==-1) {
  3849.       printf("PANIC! : Crash adding error, unexpected error found.. [%d]\n",__LINE__);
  3850. #if BDEBUG==1
  3851.       printf("error while crash adding\n");
  3852. #endif
  3853.       if (opt->log) {
  3854.         HTS_LOG(opt,LOG_ERROR); fprintf(opt->log,"Unexpected backing error for %s%s"LF,urladr,urlfil);
  3855.         test_flush;
  3856.       } 
  3857.  
  3858.     }
  3859.   }
  3860.  
  3861. #if BDEBUG==1
  3862.   printf("test number of socks\n");
  3863. #endif
  3864.  
  3865.   // ajouter autant de socket qu'on peut ajouter
  3866.   n=opt->maxsoc-back_nsoc(sback);
  3867. #if BDEBUG==1
  3868.   printf("%d sockets available for backing\n",n);
  3869. #endif
  3870.  
  3871.   if ((n>0) && (!opt->state._hts_setpause)) {   // si sockets libre et pas en pause, ajouter
  3872.     // remplir autant que l'on peut le cache (backing)
  3873.     back_fillmax(sback,opt,cache,liens,ptr,numero_passe,lien_tot);
  3874.   }
  3875.  
  3876.   // index du lien actuel
  3877.   {
  3878.     // ------------------------------------------------------------
  3879.     // attendre que le fichier actuel soit prΩt - BOUCLE D'ATTENTE
  3880.     do {
  3881.       /* User interaction */
  3882.       ENGINE_SAVE_CONTEXT();
  3883.       {
  3884.         hts_mirror_process_user_interaction(str, stre);
  3885.       }
  3886.       ENGINE_SET_CONTEXT();
  3887.  
  3888.       // index du lien actuel
  3889.       b=back_index(opt,sback,urladr,urlfil,savename);
  3890. #if BDEBUG==1
  3891.       printf("back index %d, waiting\n",b);
  3892. #endif
  3893.       // Continue to the loop if link still present
  3894.       if (b<0)
  3895.         break;
  3896.  
  3897.       // Receive data
  3898.       if (back[b].status>0)
  3899.         back_wait(sback,opt,cache,HTS_STAT.stat_timestart);
  3900.  
  3901.       // Continue to the loop if link still present
  3902.       b=back_index(opt,sback,urladr,urlfil,savename);
  3903.       if (b<0)
  3904.         break;
  3905.  
  3906.       // Stop the mirror
  3907.       if (!back_checkmirror(opt)) {
  3908.         *stre->exit_xh_=1;  // exit requested
  3909.         XH_uninit;
  3910.         return 0;
  3911.       }
  3912.  
  3913.       // And fill the backing stack
  3914.       if (back[b].status>0)
  3915.         back_fillmax(sback,opt,cache,liens,ptr,numero_passe,lien_tot);
  3916.  
  3917.       // Continue to the loop if link still present
  3918.       b=back_index(opt,sback,urladr,urlfil,savename);
  3919.       if (b<0)
  3920.         break;
  3921.  
  3922.       // autres occupations de HTTrack: statistiques, boucle d'attente, etc.
  3923.       if ((opt->makestat) || (opt->maketrack)) {
  3924.         TStamp l=time_local();
  3925.         if ((int) (l-makestat_time) >= 60) {   
  3926.           if (makestat_fp != NULL) {
  3927.             fspc(NULL,makestat_fp,"info");
  3928.             fprintf(makestat_fp,"Rate= %d (/"LLintP") \11NewLinks= %d (/%d)"LF,(int) ((HTS_STAT.HTS_TOTAL_RECV-*stre->makestat_total_)/(l-makestat_time)), (LLint)HTS_STAT.HTS_TOTAL_RECV,(int) lien_tot-*stre->makestat_lnk_,(int) lien_tot);
  3929.             fflush(makestat_fp);
  3930.             *stre->makestat_total_=HTS_STAT.HTS_TOTAL_RECV;
  3931.             *stre->makestat_lnk_=lien_tot;
  3932.           }
  3933.           if (stre->maketrack_fp != NULL) {
  3934.             int i;
  3935.             fspc(NULL,stre->maketrack_fp,"info"); fprintf(stre->maketrack_fp,LF);
  3936.             for(i=0;i<back_max;i++) {
  3937.               back_info(sback,i,3,stre->maketrack_fp);
  3938.             }
  3939.             fprintf(stre->maketrack_fp,LF);
  3940.             fflush(stre->maketrack_fp);
  3941.  
  3942.           }
  3943.           makestat_time=l;
  3944.         }
  3945.       }
  3946.  
  3947.       /* cancel links */
  3948.             {
  3949.         int i;
  3950.         char* s;
  3951.         while(( s = hts_cancel_file_pop(opt) ) != NULL) {
  3952.           if (strnotempty(s)) {    // fichier α canceller
  3953.             for(i = 0 ; i < back_max ; i++) {
  3954.               if ((back[i].status > 0)) {
  3955.                 if (strcmp(back[i].url_sav,s) == 0) {  // ok trouvΘ
  3956.                   if (back[i].status != 1000) {
  3957. #if HTS_DEBUG_CLOSESOCK
  3958.                     DEBUG_W("user cancel: deletehttp\n");
  3959. #endif
  3960.                     if (back[i].r.soc!=INVALID_SOCKET)
  3961.                       deletehttp(&back[i].r);
  3962.                     back[i].r.soc=INVALID_SOCKET;
  3963.                     back[i].r.statuscode=STATUSCODE_INVALID;
  3964.                     strcpybuff(back[i].r.msg,"Cancelled by User");
  3965.                     back[i].status=0;  // terminΘ
  3966.                     back_set_finished(sback, i);
  3967.                   } else    // cancel ftp.. flag α 1
  3968.                     back[i].stop_ftp = 1;
  3969.                 }
  3970.               }
  3971.             }
  3972.             s[0]='\0';
  3973.           }
  3974.           freet(s);
  3975.         }
  3976.  
  3977.         // Transfer rate
  3978.         engine_stats();
  3979.  
  3980.         // Refresh various stats
  3981.         HTS_STAT.stat_nsocket=back_nsoc(sback);
  3982.         HTS_STAT.stat_errors=fspc(opt,NULL,"error");
  3983.         HTS_STAT.stat_warnings=fspc(opt,NULL,"warning");
  3984.         HTS_STAT.stat_infos=fspc(opt,NULL,"info");
  3985.         HTS_STAT.nbk=backlinks_done(sback,liens,lien_tot,ptr);
  3986.         HTS_STAT.nb=back_transfered(HTS_STAT.stat_bytes,sback);
  3987.  
  3988.         if (!RUN_CALLBACK7(opt, loop, sback->lnk, sback->count, b,ptr,lien_tot,(int) (time_local()-HTS_STAT.stat_timestart),&HTS_STAT)) {
  3989.           if (opt->log) {
  3990.             HTS_LOG(opt,LOG_INFO); fprintf(opt->log,"Exit requested by shell or user"LF);
  3991.             test_flush;
  3992.           } 
  3993.           *stre->exit_xh_=1;  // exit requested
  3994.           XH_uninit;
  3995.           return 0;
  3996.         }
  3997.  
  3998.       }
  3999.  
  4000. #if HTS_POLL
  4001.       if ((opt->shell) || (opt->keyboard) || (opt->verbosedisplay) || (!opt->quiet)) {
  4002.         TStamp tl;
  4003.         *stre->info_shell_=1;
  4004.  
  4005.         /* Toggle with ENTER */
  4006.         if (!opt->quiet) {
  4007.           if (check_stdin()) {
  4008.             char com[256];
  4009.             linput(stdin,com,200);
  4010.             if (opt->verbosedisplay==2)
  4011.               opt->verbosedisplay=1;
  4012.             else
  4013.               opt->verbosedisplay=2;
  4014.             /* Info for wrappers */
  4015.             if ( (opt->debug>0) && (opt->log!=NULL) ) {
  4016.               HTS_LOG(opt,LOG_INFO); fprintf(opt->log,"engine: change-options"LF);
  4017.             }
  4018.             RUN_CALLBACK0(opt, chopt);
  4019.           }
  4020.         }
  4021.  
  4022.         tl=time_local();
  4023.  
  4024.         // gΘnΘrer un message d'infos sur l'Θtat actuel
  4025.         if (opt->shell) {    // si shell
  4026.           if ((tl-*stre->last_info_shell_)>0) {    // toute les 1 sec
  4027.             FILE* fp=stdout;
  4028.             int a=0;
  4029.             *stre->last_info_shell_=tl;
  4030.             if (fexist(fconcat(OPT_GET_BUFF(opt), StringBuff(opt->path_log),"hts-autopsy"))) {  // dΘbuggage: teste si le robot est vivant
  4031.               // (oui je sais un robot vivant.. mais bon.. il a le droit de vivre lui aussi)
  4032.               // (libΘrons les robots esclaves de l'internet!)
  4033.               remove(fconcat(OPT_GET_BUFF(opt), StringBuff(opt->path_log),"hts-autopsy"));
  4034.               fp=fopen(fconcat(OPT_GET_BUFF(opt), StringBuff(opt->path_log),"hts-isalive"),"wb");
  4035.               a=1;
  4036.             }
  4037.             if ((*stre->info_shell_) || a) {
  4038.               int i,j;
  4039.  
  4040.               fprintf(fp,"TIME %d"LF,(int) (tl-HTS_STAT.stat_timestart));
  4041.               fprintf(fp,"TOTAL %d"LF,(int) HTS_STAT.stat_bytes);
  4042.               fprintf(fp,"RATE %d"LF,(int) (HTS_STAT.HTS_TOTAL_RECV/(tl-HTS_STAT.stat_timestart)));
  4043.               fprintf(fp,"SOCKET %d"LF,back_nsoc(sback));
  4044.               fprintf(fp,"LINK %d"LF,lien_tot);
  4045.               {
  4046.                 LLint mem=0;
  4047.                 for(i=0;i<back_max;i++)
  4048.                   if (back[i].r.adr!=NULL)
  4049.                     mem+=back[i].r.size;
  4050.                 fprintf(fp,"INMEM "LLintP""LF,(LLint)mem);
  4051.               }
  4052.               for(j=0;j<2;j++) {  // passes pour ready et wait
  4053.                 for(i=0;i<back_max;i++) {
  4054.                   back_info(sback,i,j+1,stdout);    // maketrack_fp a la place de stdout ?? // **
  4055.                 }
  4056.               }
  4057.               fprintf(fp,LF);
  4058.               if (a)
  4059.                 fclose(fp);
  4060.               io_flush;
  4061.             }
  4062.           }
  4063.         }  // si shell
  4064.  
  4065.       }  // si shell ou keyboard (option)
  4066.       //
  4067. #endif
  4068.     } while((b>=0) && (back[max(b,0)].status>0));
  4069.  
  4070.  
  4071.     // If link not found on the stack, it's because it has already been downloaded
  4072.     // in background
  4073.     // Then, skip it and go to the next one
  4074.     if (b<0) {
  4075.       if ((opt->debug>1) && (opt->log!=NULL)) {
  4076.         HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"link #%d is ready, no more on the stack, skipping: %s%s.."LF,ptr,urladr,urlfil);
  4077.         test_flush;
  4078.       }
  4079.  
  4080.       // prochain lien
  4081.       // ptr++;
  4082.  
  4083.       return 2; // goto jump_if_done;
  4084.  
  4085.     }
  4086. #if 0
  4087.     /* FIXME - finalized HAS NO MORE THIS MEANING */
  4088.     /* link put in cache by the backing system for memory spare - reclaim */
  4089.     else if (back[b].finalized) {
  4090.       assertf(back[b].r.adr == NULL);
  4091.       /* read file in cache */
  4092.       back[b].r = cache_read_ro(opt,cache,back[b].url_adr,back[b].url_fil,back[b].url_sav, back[b].location_buffer);
  4093.       /* ensure correct location buffer set */
  4094.       back[b].r.location=back[b].location_buffer;
  4095.       if (back[b].r.statuscode == STATUSCODE_INVALID) {
  4096.         if (opt->log) {
  4097.           HTS_LOG(opt,LOG_ERROR); fprintf(opt->log,"Unexpected error: %s%s not found anymore in cache"LF,back[b].url_adr,back[b].url_fil);
  4098.           test_flush;
  4099.         }
  4100.       } else {
  4101.         if ( (opt->debug>1) && (opt->log!=NULL) ) {
  4102.           HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"reclaim file %s%s (%d)"LF,back[b].url_adr,back[b].url_fil,back[b].r.statuscode); test_flush;
  4103.         }
  4104.       }
  4105.     }
  4106. #endif
  4107.  
  4108.     if (!opt->verbosedisplay) {
  4109.       if (!opt->quiet) {
  4110.         static int roll=0;  /* static: ok */
  4111.         roll=(roll+1)%4;
  4112.         printf("%c\x0d",("/-\\|")[roll]);
  4113.         fflush(stdout);
  4114.       }
  4115.     } else if (opt->verbosedisplay==1) {
  4116.       if (b >= 0) {
  4117.         if (back[b].r.statuscode==HTTP_OK)
  4118.           printf("%d/%d: %s%s ("LLintP" bytes) - OK\33[K\r",ptr,lien_tot,back[b].url_adr,back[b].url_fil,(LLint)back[b].r.size);
  4119.         else
  4120.           printf("%d/%d: %s%s ("LLintP" bytes) - %d\33[K\r",ptr,lien_tot,back[b].url_adr,back[b].url_fil,(LLint)back[b].r.size,back[b].r.statuscode);
  4121.       } else {
  4122.         HTS_LOG(opt,LOG_ERROR); fprintf(opt->log,"Link disappeared");
  4123.       }
  4124.       fflush(stdout);
  4125.     }
  4126.     //}
  4127.  
  4128.         // ------------------------------------------------------------
  4129.     // VΘrificateur d'intΘgritΘ
  4130. #if DEBUG_CHECKINT
  4131.     _CHECKINT(&back[b],"Retour de back_wait, aprΦs le while")
  4132.     {
  4133.       int i;
  4134.       for(i=0;i<back_max;i++) {
  4135.         char si[256];
  4136.         sprintf(si,"Test global aprΦs back_wait, index %d",i);
  4137.         _CHECKINT(&back[i],si)
  4138.       }
  4139.     }
  4140. #endif
  4141.  
  4142.     // copier structure rΘponse htsblk
  4143.     if (b >= 0) {
  4144.       memcpy(r, &(back[b].r), sizeof(htsblk));
  4145.       r->location=stre->loc_;    // ne PAS copier location!! adresse, pas de buffer
  4146.       if (back[b].r.location) 
  4147.         strcpybuff(r->location,back[b].r.location);
  4148.       back[b].r.adr=NULL;    // ne pas faire de desalloc ensuite
  4149.  
  4150.       // libΘrer emplacement backing
  4151.       back_maydelete(opt,cache,sback,b);
  4152.     }
  4153.  
  4154.     // dΘbug graphique
  4155. #if BDEBUG==2
  4156.     {
  4157.       char s[12];
  4158.       int i=0;
  4159.       _GOTOXY(1,1);
  4160.       printf("Rate=%d B/sec\n",(int) (HTS_STAT.HTS_TOTAL_RECV/(time_local()-HTS_STAT.stat_timestart)));
  4161.       while(i<minimum(back_max,160)) {
  4162.         if (back[i].status>0) {
  4163.           sprintf(s,"%d",back[i].r.size);
  4164.         } else if (back[i].status==STATUS_READY) {
  4165.           strcpybuff(s,"ENDED");
  4166.         } else 
  4167.           strcpybuff(s,"   -   ");
  4168.         while(strlen(s)<8) strcatbuff(s," ");
  4169.         printf("%s",s); io_flush;
  4170.         i++;
  4171.       }
  4172.     }
  4173. #endif
  4174.  
  4175.  
  4176. #if BDEBUG==1
  4177.     printf("statuscode=%d with %s / msg=%s\n",r->statuscode,r->contenttype,r->msg);
  4178. #endif
  4179.  
  4180.   }
  4181.  
  4182.   ENGINE_SAVE_CONTEXT();
  4183.   return 0;
  4184. }
  4185.  
  4186. /* Wait for delayed types */
  4187. int hts_wait_delayed(htsmoduleStruct* str, 
  4188.                      char* adr, char* fil, char* save, 
  4189.                      char* parent_adr, char* parent_fil,
  4190.                      char* former_adr, char* former_fil, 
  4191.                      int* forbidden_url) {
  4192.   ENGINE_LOAD_CONTEXT_BASE();
  4193.   hash_struct* const hash = hashptr;
  4194.  
  4195.   int r_sv=0;
  4196.   int in_error = 0;
  4197.   LLint in_error_size = 0;
  4198.   char in_error_msg[32];
  4199.  
  4200.   // resolve unresolved type
  4201.   if (opt->savename_delayed != 0
  4202.     && *forbidden_url == 0 
  4203.     && IS_DELAYED_EXT(save) 
  4204.     && !opt->state.stop
  4205.     )
  4206.   {
  4207.     int loops;
  4208.     int continue_loop;
  4209.     if ((opt->debug>1) && (opt->log!=NULL)) {
  4210.       HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"Waiting for type to be known: %s%s"LF, adr, fil);
  4211.       test_flush;
  4212.     }
  4213.  
  4214.     /* Follow while type is unknown and redirects occurs */
  4215.     for( loops = 0, continue_loop = 1 ; IS_DELAYED_EXT(save) && continue_loop && loops < 7 ; loops++  ) {
  4216.       continue_loop = 0;
  4217.  
  4218.       /*
  4219.       Wait for an available slot 
  4220.       */
  4221.       WAIT_FOR_AVAILABLE_SOCKET();
  4222.  
  4223.       /* We can lookup directly in the cache to speedup this mess */
  4224.       if (opt->delayed_cached) {
  4225.                 lien_back back;
  4226.                 memset(&back, 0, sizeof(back));
  4227.                 back.r = cache_read(opt, cache, adr, fil, NULL, NULL);              // test uniquement
  4228.         if (back.r.statuscode == HTTP_OK && strnotempty(back.r.contenttype)) {      // cache found, and aswer is 'OK'
  4229.           if ((opt->debug>1) && (opt->log!=NULL)) {
  4230.             HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"Direct type lookup in cache (-%%D1): %s"LF, back.r.contenttype);
  4231.             test_flush;
  4232.           }
  4233.  
  4234.           /* Recompute filename with MIME type */
  4235.           save[0] = '\0';
  4236.           r_sv=url_savename(adr,fil,save,former_adr,former_fil,liens[ptr]->adr,liens[ptr]->fil,opt,liens,lien_tot,sback,cache,hash,ptr,numero_passe,&back);
  4237.  
  4238.           /* Recompute authorization with MIME type */
  4239.           {
  4240.             int new_forbidden_url = hts_acceptmime(opt, ptr, lien_tot, liens, adr,fil, back.r.contenttype);
  4241.             if (new_forbidden_url != -1) {
  4242.               if ((opt->debug>1) && (opt->log!=NULL)) {
  4243.                 HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"result for wizard mime test: %d"LF,new_forbidden_url);
  4244.                 test_flush;
  4245.               }
  4246.               if (new_forbidden_url == 1) {
  4247.                 *forbidden_url = new_forbidden_url;
  4248.                 if ((opt->debug>1) && (opt->log!=NULL)) {
  4249.                   HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"link forbidden because of MIME types restrictions: %s%s"LF, adr, fil);
  4250.                   test_flush;
  4251.                 }
  4252.                 break;        // exit loop
  4253.               }
  4254.             }
  4255.           }
  4256.  
  4257.           /* And exit loop */
  4258.           break;
  4259.         }
  4260.       }
  4261.  
  4262.       /* Check if the file was recorded already (necessary for redirects) */
  4263.       if (hash_read(hash,save,"",0,opt->urlhack) >= 0) {
  4264.         if (loops == 0) {   /* Should not happend */
  4265.           if ( opt->log!=NULL ) {
  4266.             HTS_LOG(opt,LOG_ERROR); fprintf(opt->log, "Duplicate entry in hts_wait_delayed() cancelled: %s%s -> %s"LF,adr,fil,save);
  4267.             test_flush;
  4268.           }
  4269.         }
  4270.         /* Exit loop (we're done) */
  4271.         continue_loop = 0;
  4272.         break ;
  4273.       }
  4274.  
  4275.       /* Add in backing (back_index() will respond correctly) */
  4276.       if (back_add_if_not_exists(sback,opt,cache,adr,fil,save,parent_adr,parent_fil,0) != -1) {
  4277.         int b;
  4278.         b=back_index(opt,sback,adr,fil,save); 
  4279.         if (b<0) {
  4280.           printf("PANIC! : Crash adding error, unexpected error found.. [%d]\n",__LINE__);
  4281.           XH_uninit;    // dΘsallocation mΘmoire & buffers
  4282.           return -1;
  4283.         }
  4284.  
  4285.         /* Cache read failed because file does not exists (bad delayed name!)
  4286.         Just re-add with the correct name, as we know the MIME now!
  4287.         */
  4288.         if (back[b].r.statuscode == STATUSCODE_INVALID && back[b].r.adr == NULL) {
  4289.                     lien_back delayed_back;
  4290.           //char BIGSTK delayed_ctype[128];
  4291.           // delayed_ctype[0] = '\0';
  4292.           // strncatbuff(delayed_ctype, back[b].r.contenttype, sizeof(delayed_ctype) - 1);    // copier content-type
  4293.                     back_copy_static(&back[b], &delayed_back);
  4294.  
  4295.           /* Delete entry */
  4296.           back_delete(opt,cache,sback,b);       // cancel
  4297.           b = -1;
  4298.  
  4299.           /* Recompute filename with MIME type */
  4300.           save[0] = '\0';
  4301.           r_sv=url_savename(adr,fil,save,former_adr,former_fil,liens[ptr]->adr,liens[ptr]->fil,opt,liens,lien_tot,sback,cache,hash,ptr,numero_passe,&delayed_back);
  4302.  
  4303.           /* Recompute authorization with MIME type */
  4304.           {
  4305.             int new_forbidden_url = hts_acceptmime(opt, ptr, lien_tot, liens, adr,fil, delayed_back.r.contenttype);
  4306.             if (new_forbidden_url != -1) {
  4307.               if ((opt->debug>1) && (opt->log!=NULL)) {
  4308.                 HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"result for wizard mime test: %d"LF,*forbidden_url);
  4309.                 test_flush;
  4310.               }
  4311.               if (new_forbidden_url == 1) {
  4312.                 *forbidden_url = new_forbidden_url;
  4313.                 if ((opt->debug>1) && (opt->log!=NULL)) {
  4314.                   HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"link forbidden because of MIME types restrictions: %s%s"LF, adr, fil);
  4315.                   test_flush;
  4316.                 }
  4317.                 break;        // exit loop
  4318.               }
  4319.             }
  4320.           }
  4321.  
  4322.           /* Re-Add wiht correct type */
  4323.           if (back_add_if_not_exists(sback,opt,cache,adr,fil,save,parent_adr,parent_fil,0) != -1) {
  4324.             b=back_index(opt,sback,adr,fil,save); 
  4325.           }
  4326.           if (b<0) {
  4327.             printf("PANIC! : Crash adding error, unexpected error found.. [%d]\n",__LINE__);
  4328.             XH_uninit;    // dΘsallocation mΘmoire & buffers
  4329.             return -1;
  4330.           }
  4331.           if ((opt->debug>1) && (opt->log!=NULL)) {
  4332.             HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"Type immediately loaded from cache: %s"LF, delayed_back.r.contenttype);
  4333.             test_flush;
  4334.           }
  4335.         }
  4336.  
  4337.         /* Wait for headers to be received */
  4338.         if (b >= 0) {
  4339.           back_set_locked(sback, b);    // Locked entry
  4340.         }
  4341.         do {
  4342.           if (b < 0)
  4343.             break;
  4344.  
  4345.           // temps α attendre, et remplir autant que l'on peut le cache (backing)
  4346.           if (back[b].status>0) {
  4347.             back_wait(sback,opt,cache,0);
  4348.           }
  4349.           if (ptr>=0) {
  4350.             back_fillmax(sback,opt,cache,liens,ptr,numero_passe,lien_tot);
  4351.           }
  4352.  
  4353.           // on est obligΘ d'appeler le shell pour le refresh..
  4354.           {
  4355.  
  4356.             // Transfer rate
  4357.             engine_stats();
  4358.  
  4359.             // Refresh various stats
  4360.             HTS_STAT.stat_nsocket=back_nsoc(sback);
  4361.             HTS_STAT.stat_errors=fspc(opt,NULL,"error");
  4362.             HTS_STAT.stat_warnings=fspc(opt,NULL,"warning");
  4363.             HTS_STAT.stat_infos=fspc(opt,NULL,"info");
  4364.             HTS_STAT.nbk=backlinks_done(sback,liens,lien_tot,ptr);
  4365.             HTS_STAT.nb=back_transfered(HTS_STAT.stat_bytes,sback);
  4366.  
  4367.             if (!RUN_CALLBACK7(opt, loop, sback->lnk, sback->count, b,ptr,lien_tot,(int) (time_local()-HTS_STAT.stat_timestart),&HTS_STAT)) {
  4368.               return -1;
  4369.             } else if (opt->state._hts_cancel || !back_checkmirror(opt)) {    // cancel 2 ou 1 (cancel parsing)
  4370.               back_delete(opt,cache,sback,b);       // cancel test
  4371.               break;
  4372.             }
  4373.           }
  4374.         } while(
  4375.           /* dns/connect/request */ 
  4376.                     ( back[b].status >= 99 && back[b].status <= 101 )
  4377.           ||
  4378.           /* For redirects, wait for request to be terminated */
  4379.           ( HTTP_IS_REDIRECT(back[b].r.statuscode) && back[b].status > 0 )
  4380.           ||
  4381.           /* Same for errors */
  4382.           ( HTTP_IS_ERROR(back[b].r.statuscode) && back[b].status > 0 )
  4383.           );
  4384.         if (b >= 0) {
  4385.           back_set_unlocked(sback, b);    // Unlocked entry
  4386.         }
  4387.         /* ready (chunked) or ready (regular download) or ready (completed) */
  4388.  
  4389.         // Note: filename NOT in hashtable yet - liens_record will do it, with the correct ext!
  4390.         if (b >= 0) {
  4391.                     lien_back delayed_back;
  4392.           //char BIGSTK delayed_ctype[128];
  4393.           //delayed_ctype[0] = '\0';
  4394.           //strncatbuff(delayed_ctype, back[b].r.contenttype, sizeof(delayed_ctype) - 1);    // copier content-type
  4395.                     back_copy_static(&back[b], &delayed_back);
  4396.  
  4397.           /* Error */
  4398.           if (HTTP_IS_ERROR(back[b].r.statuscode))
  4399.           {
  4400.             /* seen as in error */
  4401.             in_error = back[b].r.statuscode;
  4402.             in_error_msg[0] = 0;
  4403.             strncat(in_error_msg, back[b].r.msg, sizeof(in_error_msg) - 1);
  4404.             in_error_size = back[b].r.totalsize;
  4405.             /* don't break, even with "don't take error pages" switch, because we need to process the slot anyway (and cache the error) */
  4406.           }
  4407.           /* Moved! */
  4408.           else if (HTTP_IS_REDIRECT(back[b].r.statuscode))
  4409.           {
  4410.             char BIGSTK mov_url[HTS_URLMAXSIZE*2];
  4411.             mov_url[0] = '\0';
  4412.             strcpybuff(mov_url, back[b].r.location);    // copier URL
  4413.  
  4414.             /* Remove (temporarily created) file if it was created */
  4415.             unlink(fconv(OPT_GET_BUFF(opt),back[b].url_sav));
  4416.  
  4417.             /* Remove slot! */
  4418.             if (back[b].status == STATUS_READY) {
  4419.               back_maydelete(opt, cache, sback, b);
  4420.             } else {    /* should not happend */
  4421.               back_delete(opt, cache, sback, b);
  4422.             }
  4423.             b = -1;
  4424.  
  4425.             /* Handle redirect */
  4426.             if ((int) strnotempty(mov_url)) {    // location existe!
  4427.               char BIGSTK mov_adr[HTS_URLMAXSIZE*2],mov_fil[HTS_URLMAXSIZE*2];
  4428.               mov_adr[0]=mov_fil[0]='\0';
  4429.               //
  4430.               if (ident_url_relatif(mov_url,adr,fil,mov_adr,mov_fil)>=0) {                        
  4431.                 if ((opt->debug>1) && (opt->log!=NULL)) {
  4432.                   HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"Redirect while resolving type: %s%s -> %s%s"LF, adr, fil, mov_adr, mov_fil);
  4433.                   test_flush;
  4434.                 }
  4435.                 // si non bouclage sur soi mΩme, ou si test avec GET non testΘ
  4436.                 if (strcmp(mov_adr,adr) != 0 || strcmp(mov_fil,fil) != 0) {
  4437.  
  4438.                   // recopier former_adr/fil?
  4439.                   if ((former_adr) && (former_fil)) {
  4440.                     if (strnotempty(former_adr)==0) {    // Pas dΘja notΘ
  4441.                       strcpybuff(former_adr,adr);
  4442.                       strcpybuff(former_fil,fil);
  4443.                     }
  4444.                   }
  4445.  
  4446.                   // check explicit forbidden - don't follow 3xx in this case
  4447.                   {
  4448.                     int set_prio_to=0;
  4449.                     if (hts_acceptlink(opt,ptr,lien_tot,liens,
  4450.                       mov_adr,mov_fil,
  4451.                       NULL, NULL,
  4452.                       &set_prio_to,
  4453.                       NULL) == 1) 
  4454.                     {  /* forbidden */
  4455.                       /* Note: the cache 'cached_tests' system will remember this error, and we'll only issue ONE request */
  4456.                       *forbidden_url = 1;          /* Forbidden! */
  4457.                       if ((opt->debug>1) && (opt->log!=NULL)) {
  4458.                         HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"link forbidden because of redirect beyond the mirror scope at %s%s -> %s%s"LF,adr,fil,mov_adr,mov_fil);
  4459.                         test_flush;
  4460.                       }
  4461.                       strcpybuff(adr,mov_adr);
  4462.                       strcpybuff(fil,mov_fil);
  4463.                       mov_url[0]='\0';
  4464.                       break;
  4465.                     }
  4466.                   }
  4467.  
  4468.                   // ftp: stop!
  4469.                   if (strfield(mov_url,"ftp://")
  4470. #if HTS_USEMMS
  4471.                                         || strfield(mov_url,"mms://")
  4472. #endif
  4473.                                         ) {
  4474.                     strcpybuff(adr,mov_adr);
  4475.                     strcpybuff(fil,mov_fil);
  4476.                     break;
  4477.                   }
  4478.  
  4479.                   /* ok, continue */
  4480.                   strcpybuff(adr,mov_adr);
  4481.                   strcpybuff(fil,mov_fil);
  4482.                   continue_loop = 1;
  4483.  
  4484.                   /* Recompute filename for hash lookup */
  4485.                   save[0] = '\0';
  4486.                   r_sv=url_savename(adr,fil,save,former_adr,former_fil,liens[ptr]->adr,liens[ptr]->fil,opt,liens,lien_tot,sback,cache,hash,ptr,numero_passe,&delayed_back);
  4487.                 } else {
  4488.                   if ( opt->log!=NULL ) {
  4489.                     HTS_LOG(opt,LOG_WARNING); fprintf(opt->log,"Unable to test %s%s (loop to same filename)"LF,adr,fil);
  4490.                     test_flush;
  4491.                   }
  4492.                 }  // loop to same location
  4493.               }  // ident_url_relatif()
  4494.             }  // location
  4495.           }  // redirect
  4496.           if ((opt->debug>1) && (opt->log!=NULL)) {
  4497.                         HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"Final type for %s%s: '%s'"LF, adr, fil, delayed_back.r.contenttype);
  4498.             test_flush;
  4499.           }
  4500.  
  4501.                     /* If we are done, do additional checks with final type and authorizations */
  4502.                     if (!continue_loop) {
  4503.                         /* Recompute filename with MIME type */
  4504.                         save[0] = '\0';
  4505.                         r_sv=url_savename(adr,fil,save,former_adr,former_fil,liens[ptr]->adr,liens[ptr]->fil,opt,liens,lien_tot,sback,cache,hash,ptr,numero_passe,&delayed_back);
  4506.  
  4507.                         /* Recompute authorization with MIME type */
  4508.                         {
  4509.                             int new_forbidden_url = hts_acceptmime(opt, ptr, lien_tot, liens, adr,fil, delayed_back.r.contenttype);
  4510.                             if (new_forbidden_url != -1) {
  4511.                                 if ((opt->debug>1) && (opt->log!=NULL)) {
  4512.                                     HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"result for wizard mime test: %d"LF,*forbidden_url);
  4513.                                     test_flush;
  4514.                                 }
  4515.                                 if (new_forbidden_url == 1) {
  4516.                                     *forbidden_url = new_forbidden_url;
  4517.                                     if ((opt->debug>1) && (opt->log!=NULL)) {
  4518.                                         HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"link forbidden because of MIME types restrictions: %s%s"LF, adr, fil);
  4519.                                         test_flush;
  4520.                                     }
  4521.                                     break;        // exit loop
  4522.                                 }
  4523.                             }
  4524.                         }
  4525.                     }
  4526.  
  4527.           /* Still have a back reference */
  4528.           if (b >= 0) {
  4529.             /* Finalize now as we have the type */
  4530.                         if (back[b].status == STATUS_READY) {
  4531.                             if (!back[b].finalized) {
  4532.                                 back_finalize(opt,cache,sback,b);
  4533.                             }
  4534.                         }
  4535.             /* Patch destination filename for direct-to-disk mode */
  4536.             strcpybuff(back[b].url_sav, save);
  4537.           }
  4538.  
  4539.         }  // b >= 0
  4540.       } else {
  4541.         printf("PANIC! : Crash adding error, unexpected error found.. [%d]\n",__LINE__);
  4542.         XH_uninit;    // dΘsallocation mΘmoire & buffers
  4543.         return -1;
  4544.       }
  4545.  
  4546.     } // while(IS_DELAYED_EXT(save))
  4547.  
  4548.     if (in_error != 0) {
  4549.       /* 'no error page' selected or file discarded by size rules! */
  4550.       if (!opt->errpage || ( in_error == STATUSCODE_TOO_BIG ) ) {
  4551.         /* Note: the cache 'cached_tests' system will remember this error, and we'll only issue ONE request */
  4552. #if 0
  4553.         /* No (3.43) - don't do that. We must not post-exclude an authorized link, because this will prevent the cache
  4554.         system from processing it, leading to refetch it endlessly. Just accept it, and handle the error as
  4555.         usual during parsing.
  4556.         */
  4557.         *forbidden_url = 1;          /* Forbidden! */
  4558. #endif
  4559.         if (opt->log != NULL && opt->debug > 0) {
  4560.           if (in_error == STATUSCODE_TOO_BIG) {
  4561.             HTS_LOG(opt, LOG_INFO); fprintf(opt->log,"link not taken because of its size (%d bytes) at %s%s"LF,(int)in_error_size,adr,fil);
  4562.           } else {
  4563.             HTS_LOG(opt, LOG_INFO); fprintf(opt->log,"link not taken because of error (%d '%s') at %s%s"LF,in_error,in_error_msg,adr,fil);
  4564.           }
  4565.           test_flush;
  4566.         }
  4567.       }
  4568.     }
  4569.  
  4570.     // error
  4571.     if (*forbidden_url != 1
  4572.       && IS_DELAYED_EXT(save)) {
  4573.       *forbidden_url = 1;
  4574.       if (opt->log!=NULL) {
  4575.         if (in_error) {
  4576.           HTS_LOG(opt,LOG_WARNING); fprintf(opt->log,"link in error (%d '%s'), type unknown, aborting: %s%s"LF, in_error, in_error_msg, adr, fil);
  4577.         } else {
  4578.           HTS_LOG(opt,LOG_WARNING); fprintf(opt->log,"link is probably looping, type unknown, aborting: %s%s"LF, adr, fil);
  4579.         }
  4580.         test_flush;
  4581.       }
  4582.     }
  4583.  
  4584.   }  // delayed type check ?
  4585.  
  4586.   ENGINE_SAVE_CONTEXT_BASE();
  4587.  
  4588.   return 0;
  4589. }
  4590.  
  4591.